diff options
| author | Stefan Boberg <[email protected]> | 2025-11-07 14:49:13 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-11-07 14:49:13 +0100 |
| commit | 24e43a913f29ac3b314354e8ce5175f135bcc64f (patch) | |
| tree | ca442937ceeb63461012b33a4576e9835099f106 /thirdparty/blake3/c | |
| parent | get oplog attachments (#622) (diff) | |
| download | zen-24e43a913f29ac3b314354e8ce5175f135bcc64f.tar.xz zen-24e43a913f29ac3b314354e8ce5175f135bcc64f.zip | |
switch to xmake for package management (#611)
This change removes our dependency on vcpkg for package management, in favour of bringing some code in-tree in the `thirdparty` folder as well as using the xmake build-in package management feature. For the latter, all the package definitions are maintained in the zen repo itself, in the `repo` folder.
It should now also be easier to build the project as it will no longer depend on having the right version of vcpkg installed, which has been a common problem for new people coming in to the codebase. Now you should only need xmake to build.
* Bumps xmake requirement on github runners to 2.9.9 to resolve an issue where xmake on Windows invokes cmake with `v144` toolchain which does not exist
* BLAKE3 is now in-tree at `thirdparty/blake3`
* cpr is now in-tree at `thirdparty/cpr`
* cxxopts is now in-tree at `thirdparty/cxxopts`
* fmt is now in-tree at `thirdparty/fmt`
* robin-map is now in-tree at `thirdparty/robin-map`
* ryml is now in-tree at `thirdparty/ryml`
* sol2 is now in-tree at `thirdparty/sol2`
* spdlog is now in-tree at `thirdparty/spdlog`
* utfcpp is now in-tree at `thirdparty/utfcpp`
* xmake package repo definitions is in `repo`
* implemented support for sanitizers. ASAN is supported on windows, TSAN, UBSAN, MSAN etc are supported on Linux/MacOS though I have not yet tested it extensively on MacOS
* the zencore encryption implementation also now supports using mbedTLS which is used on MacOS, though for now we still use openssl on Linux
* crashpad
* bumps libcurl to 8.11.0 (from 8.8.0) which should address a rare build upload bug
Diffstat (limited to 'thirdparty/blake3/c')
42 files changed, 36447 insertions, 0 deletions
diff --git a/thirdparty/blake3/c/.gitignore b/thirdparty/blake3/c/.gitignore new file mode 100644 index 000000000..6385bd378 --- /dev/null +++ b/thirdparty/blake3/c/.gitignore @@ -0,0 +1,7 @@ +blake3 +example +example_tbb +build/ +*.o + +CMakeUserPresets.json diff --git a/thirdparty/blake3/c/Makefile.testing b/thirdparty/blake3/c/Makefile.testing new file mode 100644 index 000000000..b540528d3 --- /dev/null +++ b/thirdparty/blake3/c/Makefile.testing @@ -0,0 +1,82 @@ +# This Makefile is only for testing. C callers should follow the instructions +# in ./README.md to incorporate these C files into their existing build. + +NAME=blake3 +CC=gcc +CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden +LDFLAGS=-pie -Wl,-z,relro,-z,now +TARGETS= +ASM_TARGETS= +EXTRAFLAGS=-Wa,--noexecstack + +ifdef BLAKE3_NO_SSE2 +EXTRAFLAGS += -DBLAKE3_NO_SSE2 +else +TARGETS += blake3_sse2.o +ASM_TARGETS += blake3_sse2_x86-64_unix.S +endif + +ifdef BLAKE3_NO_SSE41 +EXTRAFLAGS += -DBLAKE3_NO_SSE41 +else +TARGETS += blake3_sse41.o +ASM_TARGETS += blake3_sse41_x86-64_unix.S +endif + +ifdef BLAKE3_NO_AVX2 +EXTRAFLAGS += -DBLAKE3_NO_AVX2 +else +TARGETS += blake3_avx2.o +ASM_TARGETS += blake3_avx2_x86-64_unix.S +endif + +ifdef BLAKE3_NO_AVX512 +EXTRAFLAGS += -DBLAKE3_NO_AVX512 +else +TARGETS += blake3_avx512.o +ASM_TARGETS += blake3_avx512_x86-64_unix.S +endif + +ifdef BLAKE3_USE_NEON +EXTRAFLAGS += -DBLAKE3_USE_NEON=1 +TARGETS += blake3_neon.o +endif + +ifdef BLAKE3_NO_NEON +EXTRAFLAGS += -DBLAKE3_USE_NEON=0 +endif + +all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) + +blake3_sse2.o: blake3_sse2.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2 + +blake3_sse41.o: blake3_sse41.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1 + +blake3_avx2.o: blake3_avx2.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2 + +blake3_avx512.o: blake3_avx512.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl + +blake3_neon.o: blake3_neon.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ + +test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined +test: all + ./test.py + +asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) + +test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined +test_asm: asm + ./test.py + +example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS) + +clean: + rm -f $(NAME) *.o diff --git a/thirdparty/blake3/c/README.md b/thirdparty/blake3/c/README.md new file mode 100644 index 000000000..57260b43b --- /dev/null +++ b/thirdparty/blake3/c/README.md @@ -0,0 +1,403 @@ +The official C implementation of BLAKE3. + +# Example + +An example program that hashes bytes from standard input and prints the +result: + +```c +#include "blake3.h" +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +int main(void) { + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + while (1) { + ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); + if (n > 0) { + blake3_hasher_update(&hasher, buf, n); + } else if (n == 0) { + break; // end of file + } else { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + return 1; + } + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} +``` + +The code above is included in this directory as `example.c`. If you're +on x86\_64 with a Unix-like OS, you can compile a working binary like +this: + +```bash +gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +# API + +## The Struct + +```c +typedef struct { + // private fields +} blake3_hasher; +``` + +An incremental BLAKE3 hashing state, which can accept any number of +updates. This implementation doesn't allocate any heap memory, but +`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes +on x86-64. This size can be reduced by restricting the maximum input +length, as described in Section 5.4 of [the BLAKE3 +spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), +but this implementation doesn't currently support that strategy. + +## Common API Functions + +```c +void blake3_hasher_init( + blake3_hasher *self); +``` + +Initialize a `blake3_hasher` in the default hashing mode. + +--- + +```c +void blake3_hasher_update( + blake3_hasher *self, + const void *input, + size_t input_len); +``` + +Add input to the hasher. This can be called any number of times. This function +is always single-threaded; for multithreading see `blake3_hasher_update_tbb` +below. + + +--- + +```c +void blake3_hasher_finalize( + const blake3_hasher *self, + uint8_t *out, + size_t out_len); +``` + +Finalize the hasher and return an output of any length, given in bytes. +This doesn't modify the hasher itself, and it's possible to finalize +again after adding more input. The constant `BLAKE3_OUT_LEN` provides +the default output length, 32 bytes, which is recommended for most +callers. See the [Security Notes](#security-notes) below. + +## Less Common API Functions + +```c +void blake3_hasher_init_keyed( + blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +``` + +Initialize a `blake3_hasher` in the keyed hashing mode. The key must be +exactly 32 bytes. + +--- + +```c +void blake3_hasher_init_derive_key( + blake3_hasher *self, + const char *context); +``` + +Initialize a `blake3_hasher` in the key derivation mode. The context +string is given as an initialization parameter, and afterwards input key +material should be given with `blake3_hasher_update`. The context string +is a null-terminated C string which should be **hardcoded, globally +unique, and application-specific**. The context string should not +include any dynamic input like salts, nonces, or identifiers read from a +database at runtime. A good default format for the context string is +`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com +2019-12-25 16:18:03 session tokens v1"`. + +This function is intended for application code written in C. For +language bindings, see `blake3_hasher_init_derive_key_raw` below. + +--- + +```c +void blake3_hasher_init_derive_key_raw( + blake3_hasher *self, + const void *context, + size_t context_len); +``` + +As `blake3_hasher_init_derive_key` above, except that the context string +is given as a pointer to an array of arbitrary bytes with a provided +length. This is intended for writing language bindings, where C string +conversion would add unnecessary overhead and new error cases. Unicode +strings should be encoded as UTF-8. + +Application code in C should prefer `blake3_hasher_init_derive_key`, +which takes the context as a C string. If you need to use arbitrary +bytes as a context string in application code, consider whether you're +violating the requirement that context strings should be hardcoded. + +--- + +```c +void blake3_hasher_update_tbb( + blake3_hasher *self, + const void *input, + size_t input_len); +``` + +Add input to the hasher, using [oneTBB] to process large inputs using multiple +threads. This can be called any number of times. This gives the same result as +`blake3_hasher_update` above. + +[oneTBB]: https://uxlfoundation.github.io/oneTBB/ + +NOTE: This function is only enabled when the library is compiled with CMake option `BLAKE3_USE_TBB` +and when the oneTBB library is detected on the host system. See the building instructions for +further details. + +To get any performance benefit from multithreading, the input buffer needs to +be large. As a rule of thumb on x86_64, `blake3_hasher_update_tbb` is _slower_ +than `blake3_hasher_update` for inputs under 128 KiB. That threshold varies +quite a lot across different processors, and it's important to benchmark your +specific use case. + +Hashing large files with this function usually requires +[memory-mapping](https://en.wikipedia.org/wiki/Memory-mapped_file), since +reading a file into memory in a single-threaded loop takes longer than hashing +the resulting buffer. Note that hashing a memory-mapped file with this function +produces a "random" pattern of disk reads, which can be slow on spinning disks. +Again it's important to benchmark your specific use case. + +This implementation doesn't require configuration of thread resources and will +use as many cores as possible by default. More fine-grained control of +resources is possible using the [oneTBB] API. + +--- + +```c +void blake3_hasher_finalize_seek( + const blake3_hasher *self, + uint64_t seek, + uint8_t *out, + size_t out_len); +``` + +The same as `blake3_hasher_finalize`, but with an additional `seek` +parameter for the starting byte position in the output stream. To +efficiently stream a large output without allocating memory, call this +function in a loop, incrementing `seek` by the output length each time. + +--- + +```c +void blake3_hasher_reset( + blake3_hasher *self); +``` + +Reset the hasher to its initial state, prior to any calls to +`blake3_hasher_update`. Currently this is no different from calling +`blake3_hasher_init` or similar again. + +# Security Notes + +Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit +BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 +bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional +security. + +Avoid relying on the secrecy of the output offset, that is, the `seek` argument of +`blake3_hasher_finalize_seek`. [_Block-Cipher-Based Tree Hashing_ by Aldo +Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows both the message +and the key (if any) can easily determine the offset of an extended output. For comparison, +AES-CTR has a similar property: if you know the key, you can decrypt a block from an unknown +position in the output stream to recover its block index. Callers with strong secret keys +aren't affected in practice, but secret offsets are a [design +smell](https://en.wikipedia.org/wiki/Design_smell) in any case. + +# Building + +The easiest and most complete method of compiling this library is with CMake. +This is the method described in the next section. Toward the end of the +building section there are more in depth notes about compiling manually and +things that are useful to understand if you need to integrate this library with +another build system. + +## CMake + +The minimum version of CMake is 3.9. The following invocations will compile and +install `libblake3`. With recent CMake: + +```bash +cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" +cmake --build c/build --target install +``` + +With an older CMake: + +```bash +cd c +mkdir build +cd build +cmake .. "-DCMAKE_INSTALL_PREFIX=/usr/local" +cmake --build . --target install +``` + +The following options are available when compiling with CMake: + +- `BLAKE3_USE_TBB`: Enable oneTBB parallelism (Requires a C++20 capable compiler) +- `BLAKE3_FETCH_TBB`: Allow fetching oneTBB from GitHub (only if not found on system) +- `BLAKE3_EXAMPLES`: Compile and install example programs + +Options can be enabled like this: + +```bash +cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" -DBLAKE3_USE_TBB=1 -DBLAKE3_FETCH_TBB=1 +``` + +## Building manually + +We try to keep the build simple enough that you can compile this library "by +hand", and it's expected that many callers will integrate it with their +pre-existing build systems. See the `gcc` one-liner in the "Example" section +above. + +### x86 + +Dynamic dispatch is enabled by default on x86. The implementation will +query the CPU at runtime to detect SIMD support, and it will use the +widest instruction set available. By default, `blake3_dispatch.c` +expects to be linked with code for five different instruction sets: +portable C, SSE2, SSE4.1, AVX2, and AVX-512. + +For each of the x86 SIMD instruction sets, four versions are available: +three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one +version using C intrinsics. The assembly versions are generally +preferred. They perform better, they perform more consistently across +different compilers, and they build more quickly. On the other hand, the +assembly versions are x86\_64-only, and you need to select the right +flavor for your target platform. + +Here's an example of building a shared library on x86\_64 Linux using +the assembly implementations: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +When building the intrinsics-based implementations, you need to build +each implementation separately, with the corresponding instruction set +explicitly enabled in the compiler. Here's the same shared library using +the intrinsics-based implementations: + +```bash +gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o +gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o +gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o +gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o +``` + +Note above that building `blake3_avx512.c` requires both `-mavx512f` and +`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512` +flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`. +MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a +corresponding flag. + +If you want to omit SIMD code entirely, you need to explicitly disable +each instruction set. Here's an example of building a shared library on +x86 with only portable code: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \ + -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c +``` + +### ARM NEON + +The NEON implementation is enabled by default on AArch64, but not on +other ARM targets, since not all of them support it. To enable it, set +`BLAKE3_USE_NEON=1`. Here's an example of building a shared library on +ARM Linux with NEON support: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \ + blake3_portable.c blake3_neon.c +``` + +To explicitiy disable using NEON instructions on AArch64, set +`BLAKE3_USE_NEON=0`. + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \ + blake3_portable.c +``` + +Note that on some targets (ARMv7 in particular), extra flags may be +required to activate NEON support in the compiler. If you see an error +like... + +``` +/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed +in call to always_inline ‘vaddq_u32’: target specific option mismatch +``` + +...then you may need to add something like `-mfpu=neon-vfpv4 +-mfloat-abi=hard`. + +### Other Platforms + +The portable implementation should work on most other architectures. For +example: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c +``` + +### Multithreading + +Multithreading is available using [oneTBB], by compiling the optional C++ +support file [`blake3_tbb.cpp`](./blake3_tbb.cpp). For an example of using +`mmap` (non-Windows) and `blake3_hasher_update_tbb` to get large-file +performance on par with [`b3sum`](../b3sum), see +[`example_tbb.c`](./example_tbb.c). You can build it like this: + +```bash +g++ -c -O3 -fno-exceptions -fno-rtti -DBLAKE3_USE_TBB -o blake3_tbb.o blake3_tbb.cpp +gcc -O3 -o example_tbb -lstdc++ -ltbb -DBLAKE3_USE_TBB blake3_tbb.o example_tbb.c blake3.c \ + blake3_dispatch.c blake3_portable.c blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S \ + blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S +``` + +NOTE: `-fno-exceptions` or equivalent is required to compile `blake3_tbb.cpp`, +and public API methods with external C linkage are marked `noexcept`. Compiling +that file with exceptions enabled will fail. Compiling with RTTI disabled isn't +required but is recommended for code size. diff --git a/thirdparty/blake3/c/blake3-config.cmake.in b/thirdparty/blake3/c/blake3-config.cmake.in new file mode 100644 index 000000000..92cf395b8 --- /dev/null +++ b/thirdparty/blake3/c/blake3-config.cmake.in @@ -0,0 +1,14 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) + +# Remember TBB option state +set(BLAKE3_USE_TBB @BLAKE3_USE_TBB@) + +if(BLAKE3_USE_TBB) + find_dependency(TBB @TBB_VERSION@) +endif() + +include("${CMAKE_CURRENT_LIST_DIR}/blake3-targets.cmake") + +check_required_components(blake3) diff --git a/thirdparty/blake3/c/blake3.c b/thirdparty/blake3/c/blake3.c new file mode 100644 index 000000000..74fc2e73b --- /dev/null +++ b/thirdparty/blake3/c/blake3.c @@ -0,0 +1,650 @@ +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#include "blake3.h" +#include "blake3_impl.h" + +const char *blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + if (out_len == 0) { + return; + } + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + if(offset_within_block) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + const size_t available_bytes = 64 - offset_within_block; + const size_t bytes = out_len > available_bytes ? available_bytes : out_len; + memcpy(out, wide_buf + offset_within_block, bytes); + out += bytes; + out_len -= bytes; + output_block_counter += 1; + } + if(out_len / 64) { + blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64); + } + output_block_counter += out_len / 64; + out += out_len & -64; + out_len -= out_len & -64; + if(out_len) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + memcpy(out, wide_buf, out_len); + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + chunk_state_fill_buf(self, input, input_len); +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_subtree_len(size_t input_len) { + // Subtract 1 to reserve at least one byte for the right side. input_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out, bool use_tbb) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_subtree_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! + size_t left_n = -1; + size_t right_n = -1; + +#if defined(BLAKE3_USE_TBB) + blake3_compress_subtree_wide_join_tbb( + key, flags, use_tbb, + // left-hand side + input, left_input_len, chunk_counter, cv_array, &left_n, + // right-hand side + right_input, right_input_len, right_chunk_counter, right_cvs, &right_n); +#else + left_n = blake3_compress_subtree_wide( + input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb); + right_n = blake3_compress_subtree_wide(right_input, right_input_len, key, + right_chunk_counter, flags, right_cvs, + use_tbb); +#endif // BLAKE3_USE_TBB + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void +compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, + uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN], + bool use_tbb) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array, use_tbb); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } +#endif + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input, + size_t input_len, bool use_tbb) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. + if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair, use_tbb); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_chunk_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = false; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} + +#if defined(BLAKE3_USE_TBB) +void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = true; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} +#endif // BLAKE3_USE_TBB + +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. + cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +void blake3_hasher_reset(blake3_hasher *self) { + chunk_state_reset(&self->chunk, self->key, 0); + self->cv_stack_len = 0; +} diff --git a/thirdparty/blake3/c/blake3.h b/thirdparty/blake3/c/blake3.h new file mode 100644 index 000000000..08932954c --- /dev/null +++ b/thirdparty/blake3/c/blake3.h @@ -0,0 +1,86 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include <stddef.h> +#include <stdint.h> + +#if !defined(BLAKE3_API) +# if defined(_WIN32) || defined(__CYGWIN__) +# if defined(BLAKE3_DLL) +# if defined(BLAKE3_DLL_EXPORTS) +# define BLAKE3_API __declspec(dllexport) +# else +# define BLAKE3_API __declspec(dllimport) +# endif +# define BLAKE3_PRIVATE +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +# elif __GNUC__ >= 4 +# define BLAKE3_API __attribute__((visibility("default"))) +# define BLAKE3_PRIVATE __attribute__((visibility("hidden"))) +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "1.8.2" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +BLAKE3_API const char *blake3_version(void); +BLAKE3_API void blake3_hasher_init(blake3_hasher *self); +BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +#if defined(BLAKE3_USE_TBB) +BLAKE3_API void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, + size_t input_len); +#endif // BLAKE3_USE_TBB +BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); +BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/thirdparty/blake3/c/blake3_avx2.c b/thirdparty/blake3/c/blake3_avx2.c new file mode 100644 index 000000000..381e7c422 --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx2.c @@ -0,0 +1,326 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 8 + +INLINE __m256i loadu(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE void storeu(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. +INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m256i rot16(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} + +INLINE __m256i rot12(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); +} + +INLINE __m256i rot8(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, + 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} + +INLINE __m256i rot7(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); +} + +INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m256i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. + vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[8]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); + const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i add1 = _mm256_and_si256(mask, add0); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1); + __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), + _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(h_vecs); + storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +#if !defined(BLAKE3_NO_SSE41) +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#else +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + +void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } +#if !defined(BLAKE3_NO_SSE41) + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); +#else + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +#endif +} diff --git a/thirdparty/blake3/c/blake3_avx2_x86-64_unix.S b/thirdparty/blake3/c/blake3_avx2_x86-64_unix.S new file mode 100644 index 000000000..812bb8568 --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx2_x86-64_unix.S @@ -0,0 +1,1815 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/thirdparty/blake3/c/blake3_avx2_x86-64_windows_gnu.S b/thirdparty/blake3/c/blake3_avx2_x86-64_windows_gnu.S new file mode 100644 index 000000000..3d4be4a7d --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx2_x86-64_windows_gnu.S @@ -0,0 +1,1817 @@ +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +.section .text + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x2D0], xmm6 + vmovdqa xmmword ptr [rsp+0x2E0], xmm7 + vmovdqa xmmword ptr [rsp+0x2F0], xmm8 + vmovdqa xmmword ptr [rsp+0x300], xmm9 + vmovdqa xmmword ptr [rsp+0x310], xmm10 + vmovdqa xmmword ptr [rsp+0x320], xmm11 + vmovdqa xmmword ptr [rsp+0x330], xmm12 + vmovdqa xmmword ptr [rsp+0x340], xmm13 + vmovdqa xmmword ptr [rsp+0x350], xmm14 + vmovdqa xmmword ptr [rsp+0x360], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x260], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x2A0], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2C0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2C0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x2A0] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220] + vmovdqa ymmword ptr [rsp+0x220], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x240] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x2D0] + vmovdqa xmm7, xmmword ptr [rsp+0x2E0] + vmovdqa xmm8, xmmword ptr [rsp+0x2F0] + vmovdqa xmm9, xmmword ptr [rsp+0x300] + vmovdqa xmm10, xmmword ptr [rsp+0x310] + vmovdqa xmm11, xmmword ptr [rsp+0x320] + vmovdqa xmm12, xmmword ptr [rsp+0x330] + vmovdqa xmm13, xmmword ptr [rsp+0x340] + vmovdqa xmm14, xmmword ptr [rsp+0x350] + vmovdqa xmm15, xmmword ptr [rsp+0x360] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x2C0] + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x220] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x240] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x260] + vmovaps xmm0, xmmword ptr [rsp+0x220] + vmovaps xmm1, xmmword ptr [rsp+0x230] + vmovaps xmm2, xmmword ptr [rsp+0x240] + vmovaps xmm3, xmmword ptr [rsp+0x250] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x220], xmm0 + vmovaps xmmword ptr [rsp+0x240], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x220] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x224] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x260] + vmovaps ymm0, ymmword ptr [rsp+0x220] + vmovups ymm1, ymmword ptr [rsp+0x228] + vmovaps ymm2, ymmword ptr [rsp+0x240] + vmovups ymm3, ymmword ptr [rsp+0x248] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x220], ymm0 + vmovaps ymmword ptr [rsp+0x240], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x220] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.section .rdata +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/thirdparty/blake3/c/blake3_avx2_x86-64_windows_msvc.asm b/thirdparty/blake3/c/blake3_avx2_x86-64_windows_msvc.asm new file mode 100644 index 000000000..352298edd --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx2_x86-64_windows_msvc.asm @@ -0,0 +1,1828 @@ +public _blake3_hash_many_avx2 +public blake3_hash_many_avx2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx2 PROC +_blake3_hash_many_avx2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+2D0H], xmm6 + vmovdqa xmmword ptr [rsp+2E0H], xmm7 + vmovdqa xmmword ptr [rsp+2F0H], xmm8 + vmovdqa xmmword ptr [rsp+300H], xmm9 + vmovdqa xmmword ptr [rsp+310H], xmm10 + vmovdqa xmmword ptr [rsp+320H], xmm11 + vmovdqa xmmword ptr [rsp+330H], xmm12 + vmovdqa xmmword ptr [rsp+340H], xmm13 + vmovdqa xmmword ptr [rsp+350H], xmm14 + vmovdqa xmmword ptr [rsp+360H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+260H], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0] + vpand ymm2, ymm0, ymmword ptr [ADD1] + vmovdqa ymmword ptr [rsp+2A0H], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+220H], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm3 + shl rdx, 6 + mov qword ptr [rsp+2C0H], rdx + cmp rsi, 8 + jc final7blocks +outerloop8: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+2C0H] + cmove eax, ebx + mov dword ptr [rsp+200H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+20H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+40H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+60H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+80H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0E0H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+100H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+120H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+140H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+160H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+180H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+1A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+1C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+1E0H], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+200H] + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+220H] + vpxor ymm13, ymm1, ymmword ptr [rsp+240H] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+2A0H] + vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] + vmovdqa ymmword ptr [rsp+220H], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+240H] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + sub rsi, 8 + cmp rsi, 8 + jnc outerloop8 + test rsi, rsi + jnz final7blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+2D0H] + vmovdqa xmm7, xmmword ptr [rsp+2E0H] + vmovdqa xmm8, xmmword ptr [rsp+2F0H] + vmovdqa xmm9, xmmword ptr [rsp+300H] + vmovdqa xmm10, xmmword ptr [rsp+310H] + vmovdqa xmm11, xmmword ptr [rsp+320H] + vmovdqa xmm12, xmmword ptr [rsp+330H] + vmovdqa xmm13, xmmword ptr [rsp+340H] + vmovdqa xmm14, xmmword ptr [rsp+350H] + vmovdqa xmm15, xmmword ptr [rsp+360H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+2C0H] + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + test rsi, 4H + je final3blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+220H] + vbroadcasti128 ymm13, xmmword ptr [rsp+240H] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 50H + vpermq ymm15, ymm15, 50H + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] + vpblendd ymm14, ymm14, ymm12, 44H + vpblendd ymm15, ymm15, ymm12, 44H + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+20H], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vmovups ymm2, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + vmovups ymm10, ymmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 93H + vpshufd ymm15, ymm15, 93H + vpbroadcastd ymm2, dword ptr [rsp+200H] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+20H] + vpblendd ymm3, ymm3, ymm2, 88H + vpblendd ymm11, ymm11, ymm2, 88H + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vmovdqa ymm10, ymm2 + mov al, 7 +roundloop4: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+40H], ymm4 + nop + vmovdqa ymmword ptr [rsp+60H], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+80H], ymm5 + vmovdqa ymmword ptr [rsp+0A0H], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 93H + vpshufd ymm8, ymm8, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 39H + vpshufd ymm10, ymm10, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 39H + vpshufd ymm8, ymm8, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 93H + vpshufd ymm10, ymm10, 93H + dec al + je endroundloop4 + vmovdqa ymm4, ymmword ptr [rsp+40H] + vmovdqa ymm5, ymmword ptr [rsp+80H] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0FH + vpshufd ymm4, ymm12, 39H + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0AAH + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 88H + vpshufd ymm12, ymm12, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymmword ptr [rsp+40H], ymm13 + vmovdqa ymmword ptr [rsp+80H], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+60H] + vmovdqa ymm13, ymmword ptr [rsp+0A0H] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0FH + vpshufd ymm12, ymm5, 39H + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0AAH + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 88H + vpshufd ymm5, ymm5, 78H + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 1EH + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+40H] + vmovdqa ymm6, ymmword ptr [rsp+80H] + jmp roundloop4 +endroundloop4: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqu xmmword ptr [rbx+40H], xmm8 + vmovdqu xmmword ptr [rbx+50H], xmm9 + vextracti128 xmmword ptr [rbx+60H], ymm8, 01H + vextracti128 xmmword ptr [rbx+70H], ymm9, 01H + vmovaps xmm8, xmmword ptr [rsp+260H] + vmovaps xmm0, xmmword ptr [rsp+220H] + vmovaps xmm1, xmmword ptr [rsp+230H] + vmovaps xmm2, xmmword ptr [rsp+240H] + vmovaps xmm3, xmmword ptr [rsp+250H] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+220H], xmm0 + vmovaps xmmword ptr [rsp+240H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test rsi, 2H + je final1blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp+220H] + vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+224H] + vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + vbroadcasti128 ymm14, xmmword ptr [ROT16] + vbroadcasti128 ymm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+200H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovaps ymm8, ymmword ptr [rsp+260H] + vmovaps ymm0, ymmword ptr [rsp+220H] + vmovups ymm1, ymmword ptr [rsp+228H] + vmovaps ymm2, ymmword ptr [rsp+240H] + vmovups ymm3, ymmword ptr [rsp+248H] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+220H], ymm0 + vmovaps ymmword ptr [rsp+240H], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1blocks: + test rsi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm3, dword ptr [rsp+220H] + vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm14, xmmword ptr [ROT16] + vmovdqa xmm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx2 ENDP +blake3_hash_many_avx2 ENDP +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + +ADD1: + dd 8 dup (8) + +BLAKE3_IV_0: + dd 8 dup (6A09E667H) + +BLAKE3_IV_1: + dd 8 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 8 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 8 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 8 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +_RDATA ENDS +END diff --git a/thirdparty/blake3/c/blake3_avx512.c b/thirdparty/blake3/c/blake3_avx512.c new file mode 100644 index 000000000..f88a32d31 --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx512.c @@ -0,0 +1,1388 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu_128(const uint8_t src[16]) { + return _mm_loadu_si128((void*)src); +} + +INLINE __m256i loadu_256(const uint8_t src[32]) { + return _mm256_loadu_si256((void*)src); +} + +INLINE __m512i loadu_512(const uint8_t src[64]) { + return _mm512_loadu_si512((void*)src); +} + +INLINE void storeu_128(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((void*)dest, src); +} + +INLINE void storeu_256(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((void*)dest, src); +} + +INLINE void storeu_512(__m512i src, uint8_t dest[16]) { + _mm512_storeu_si512((void*)dest, src); +} + +INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } + +INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } + +INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } + +INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } + +INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } + +INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } + +INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } + +INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } + +INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } + +INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } + +INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } + +INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } + +INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } + +INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } + +/* + * ---------------------------------------------------------------------------- + * compress_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} + +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +/* + * ---------------------------------------------------------------------------- + * hash4_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(__m128i vecs[4]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m256i mask_vec = _mm256_set1_epi64x(mask); + __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); + deltas = _mm256_and_si256(mask_vec, deltas); + __m256i counters = + _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); + *out_lo = _mm256_cvtepi64_epi32(counters); + *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); +} + +static +void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1_128(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +static +void blake3_xof4_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[4 * 64]) { + __m128i h_vecs[8] = { + set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]), + set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]), + }; + uint32_t block_words[16]; + load_block_words(block, block_words); + __m128i msg_vecs[16]; + for (size_t i = 0; i < 16; i++) { + msg_vecs[i] = set1_128(block_words[i]); + } + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, true, &counter_low_vec, &counter_high_vec); + __m128i block_len_vec = set1_128(block_len); + __m128i block_flags_vec = set1_128(flags); + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + for (size_t i = 0; i < 8; i++) { + v[i] = xor_128(v[i], v[i+8]); + v[i+8] = xor_128(v[i+8], h_vecs[i]); + } + transpose_vecs_128(&v[0]); + transpose_vecs_128(&v[4]); + transpose_vecs_128(&v[8]); + transpose_vecs_128(&v[12]); + for (size_t i = 0; i < 4; i++) { + storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]); + storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]); + storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]); + storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]); + } +} + +/* + * ---------------------------------------------------------------------------- + * hash8_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[15] = rot16_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot12_256(v[4]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[15] = rot8_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot7_256(v[4]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot16_256(v[15]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[4] = rot12_256(v[4]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot8_256(v[15]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + v[4] = rot7_256(v[4]); +} + +INLINE void transpose_vecs_256(__m256i vecs[8]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. + vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_256(&out[0]); + transpose_vecs_256(&out[8]); +} + +INLINE void load_counters8(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas = _mm512_and_si512(mask_vec, deltas); + __m512i counters = + _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); + *out_lo = _mm512_cvtepi64_epi32(counters); + *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); +} + +static +void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1_256(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + h_vecs[0] = xor_256(v[0], v[8]); + h_vecs[1] = xor_256(v[1], v[9]); + h_vecs[2] = xor_256(v[2], v[10]); + h_vecs[3] = xor_256(v[3], v[11]); + h_vecs[4] = xor_256(v[4], v[12]); + h_vecs[5] = xor_256(v[5], v[13]); + h_vecs[6] = xor_256(v[6], v[14]); + h_vecs[7] = xor_256(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_256(h_vecs); + storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +static +void blake3_xof8_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[8 * 64]) { + __m256i h_vecs[8] = { + set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]), + set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]), + }; + uint32_t block_words[16]; + load_block_words(block, block_words); + __m256i msg_vecs[16]; + for (size_t i = 0; i < 16; i++) { + msg_vecs[i] = set1_256(block_words[i]); + } + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, true, &counter_low_vec, &counter_high_vec); + __m256i block_len_vec = set1_256(block_len); + __m256i block_flags_vec = set1_256(flags); + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + for (size_t i = 0; i < 8; i++) { + v[i] = xor_256(v[i], v[i+8]); + v[i+8] = xor_256(v[i+8], h_vecs[i]); + } + transpose_vecs_256(&v[0]); + transpose_vecs_256(&v[8]); + for (size_t i = 0; i < 8; i++) { + storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]); + storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]); + } +} + +/* + * ---------------------------------------------------------------------------- + * hash16_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[15] = rot16_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot12_512(v[4]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[15] = rot8_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot7_512(v[4]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot16_512(v[15]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[4] = rot12_512(v[4]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot8_512(v[15]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + v[4] = rot7_512(v[4]); +} + +// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order +#define LO_IMM8 0x88 + +INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, LO_IMM8); +} + +// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order +#define HI_IMM8 0xdd + +INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, HI_IMM8); +} + +INLINE void transpose_vecs_512(__m512i vecs[16]) { + // Interleave 32-bit lanes. The _0 unpack is lanes + // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes + // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. + __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); + __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); + __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); + __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); + __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); + __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); + __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); + __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); + __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); + __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); + __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); + __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); + __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); + __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); + __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); + __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); + + // Interleave 64-bit lanes. The _0 unpack is lanes + // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes + // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes + // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes + // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. + __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); + __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); + __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); + __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); + __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); + __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); + __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); + __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); + __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); + __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); + __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); + __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); + __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); + __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); + __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); + __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); + + // Interleave 128-bit lanes. The _0 unpack is + // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is + // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. + __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); + __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); + __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); + __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); + __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); + __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); + __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); + __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); + __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); + __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); + __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); + __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); + __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); + __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); + __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); + __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); + + // Interleave 128-bit lanes again for the final outputs. + vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); + vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); + vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); + vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); + vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); + vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); + vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); + vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); + vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); + vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); + vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); + vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); + vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); + vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); + vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); + vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); +} + +INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, + size_t block_offset, __m512i out[16]) { + out[0] = loadu_512(&inputs[0][block_offset]); + out[1] = loadu_512(&inputs[1][block_offset]); + out[2] = loadu_512(&inputs[2][block_offset]); + out[3] = loadu_512(&inputs[3][block_offset]); + out[4] = loadu_512(&inputs[4][block_offset]); + out[5] = loadu_512(&inputs[5][block_offset]); + out[6] = loadu_512(&inputs[6][block_offset]); + out[7] = loadu_512(&inputs[7][block_offset]); + out[8] = loadu_512(&inputs[8][block_offset]); + out[9] = loadu_512(&inputs[9][block_offset]); + out[10] = loadu_512(&inputs[10][block_offset]); + out[11] = loadu_512(&inputs[11][block_offset]); + out[12] = loadu_512(&inputs[12][block_offset]); + out[13] = loadu_512(&inputs[13][block_offset]); + out[14] = loadu_512(&inputs[14][block_offset]); + out[15] = loadu_512(&inputs[15][block_offset]); + for (size_t i = 0; i < 16; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_512(out); +} + +INLINE void load_counters16(uint64_t counter, bool increment_counter, + __m512i *out_lo, __m512i *out_hi) { + const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); + const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i masked_deltas = _mm512_and_si512(deltas, mask); + const __m512i low_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)counter), + masked_deltas); + // The carry bit is 1 if the high bit of the word was 1 before addition and is + // 0 after. + // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to + // compute the carry bits here, and originally we did, but that intrinsic is + // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271. + const __m512i carries = _mm512_srli_epi32( + _mm512_andnot_si512( + low_words, // 0 after (gets inverted by andnot) + _mm512_set1_epi32((int32_t)counter)), // and 1 before + 31); + const __m512i high_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)(counter >> 32)), + carries); + *out_lo = low_words; + *out_hi = high_words; +} + +static +void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { + __m512i h_vecs[8] = { + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), + }; + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); + __m512i block_flags_vec = set1_512(block_flags); + __m512i msg_vecs[16]; + transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + h_vecs[0] = xor_512(v[0], v[8]); + h_vecs[1] = xor_512(v[1], v[9]); + h_vecs[2] = xor_512(v[2], v[10]); + h_vecs[3] = xor_512(v[3], v[11]); + h_vecs[4] = xor_512(v[4], v[12]); + h_vecs[5] = xor_512(v[5], v[13]); + h_vecs[6] = xor_512(v[6], v[14]); + h_vecs[7] = xor_512(v[7], v[15]); + + block_flags = flags; + } + + // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 + // state vectors. Pad the matrix with zeros. After transposition, store the + // lower half of each vector. + __m512i padded[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + }; + transpose_vecs_512(padded); + _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); + _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); + _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); + _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); + _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); + _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); + _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); + _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); + _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); + _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); + _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); + _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); + _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); + _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); + _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); + _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); +} + +static +void blake3_xof16_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[16 * 64]) { + __m512i h_vecs[8] = { + set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]), + set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]), + }; + uint32_t block_words[16]; + load_block_words(block, block_words); + __m512i msg_vecs[16]; + for (size_t i = 0; i < 16; i++) { + msg_vecs[i] = set1_512(block_words[i]); + } + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, true, &counter_low_vec, &counter_high_vec); + __m512i block_len_vec = set1_512(block_len); + __m512i block_flags_vec = set1_512(flags); + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + for (size_t i = 0; i < 8; i++) { + v[i] = xor_512(v[i], v[i+8]); + v[i+8] = xor_512(v[i+8], h_vecs[i]); + } + transpose_vecs_512(&v[0]); + for (size_t i = 0; i < 16; i++) { + storeu_512(v[i], &out[i * sizeof(__m512i)]); + } +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 16) { + blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 16; + } + inputs += 16; + num_inputs -= 16; + out = &out[16 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 8) { + blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 8; + } + inputs += 8; + num_inputs -= 8; + out = &out[8 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 4) { + blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} + +void blake3_xof_many_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t* out, size_t outblocks) { + while (outblocks >= 16) { + blake3_xof16_avx512(cv, block, block_len, counter, flags, out); + counter += 16; + outblocks -= 16; + out += 16 * BLAKE3_BLOCK_LEN; + } + while (outblocks >= 8) { + blake3_xof8_avx512(cv, block, block_len, counter, flags, out); + counter += 8; + outblocks -= 8; + out += 8 * BLAKE3_BLOCK_LEN; + } + while (outblocks >= 4) { + blake3_xof4_avx512(cv, block, block_len, counter, flags, out); + counter += 4; + outblocks -= 4; + out += 4 * BLAKE3_BLOCK_LEN; + } + while (outblocks > 0) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + counter += 1; + outblocks -= 1; + out += BLAKE3_BLOCK_LEN; + } +} diff --git a/thirdparty/blake3/c/blake3_avx512_x86-64_unix.S b/thirdparty/blake3/c/blake3_avx512_x86-64_unix.S new file mode 100644 index 000000000..9642e413a --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx512_x86-64_unix.S @@ -0,0 +1,4824 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 +.global blake3_xof_many_avx512 +.global _blake3_xof_many_avx512 + +#ifdef __APPLE__ +.text +#else +.section .text +#endif +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 + ret + +.p2align 6 +blake3_xof_many_avx512: +_blake3_xof_many_avx512: + _CET_ENDBR + mov r10,QWORD PTR [rsp+0x8] + cmp r10,0x1 + ja 2f + vmovdqu xmm0,XMMWORD PTR [rdi] + vmovdqu xmm1,XMMWORD PTR [rdi+0x10] + movzx eax,r8b + movzx edx,dl + shl rax,0x20 + add rdx,rax + vmovq xmm3,rcx + vmovq xmm4,rdx + vpunpcklqdq xmm3,xmm3,xmm4 + vmovaps xmm2,XMMWORD PTR [BLAKE3_IV+rip] + vmovups xmm8,XMMWORD PTR [rsi] + vmovups xmm9,XMMWORD PTR [rsi+0x10] + vshufps xmm4,xmm8,xmm9,0x88 + vshufps xmm5,xmm8,xmm9,0xdd + vmovups xmm8,XMMWORD PTR [rsi+0x20] + vmovups xmm9,XMMWORD PTR [rsi+0x30] + vshufps xmm6,xmm8,xmm9,0x88 + vshufps xmm7,xmm8,xmm9,0xdd + vpshufd xmm6,xmm6,0x93 + vpshufd xmm7,xmm7,0x93 + mov al,0x7 +3: + vpaddd xmm0,xmm0,xmm4 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm5 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x93 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x39 + vpaddd xmm0,xmm0,xmm6 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm7 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x39 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x93 + dec al + je 3f + vshufps xmm8,xmm4,xmm5,0xd6 + vpshufd xmm9,xmm4,0xf + vpshufd xmm4,xmm8,0x39 + vshufps xmm8,xmm6,xmm7,0xfa + vpblendd xmm9,xmm9,xmm8,0xaa + vpunpcklqdq xmm8,xmm7,xmm5 + vpblendd xmm8,xmm8,xmm6,0x88 + vpshufd xmm8,xmm8,0x78 + vpunpckhdq xmm5,xmm5,xmm7 + vpunpckldq xmm6,xmm6,xmm5 + vpshufd xmm7,xmm6,0x1e + vmovdqa xmm5,xmm9 + vmovdqa xmm6,xmm8 + jmp 3b +3: + vpxor xmm0,xmm0,xmm2 + vpxor xmm1,xmm1,xmm3 + vpxor xmm2,xmm2,XMMWORD PTR [rdi] + vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + ret +.p2align 6 +2: + push rbp + mov rbp,rsp + sub rsp,0x90 + and rsp,0xffffffffffffffc0 + vpbroadcastd zmm0,ecx + shr rcx,0x20 + vpbroadcastd zmm1,ecx + vpaddd zmm2,zmm0,ZMMWORD PTR [ADD0+rip] + vpcmpltud k1,zmm2,zmm0 + vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} + vmovdqa32 ZMMWORD PTR [rsp],zmm2 + vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 + cmp r10,0x10 + jb 2f +3: + vpbroadcastd zmm16,DWORD PTR [rsi] + vpbroadcastd zmm17,DWORD PTR [rsi+0x4] + vpbroadcastd zmm18,DWORD PTR [rsi+0x8] + vpbroadcastd zmm19,DWORD PTR [rsi+0xc] + vpbroadcastd zmm20,DWORD PTR [rsi+0x10] + vpbroadcastd zmm21,DWORD PTR [rsi+0x14] + vpbroadcastd zmm22,DWORD PTR [rsi+0x18] + vpbroadcastd zmm23,DWORD PTR [rsi+0x1c] + vpbroadcastd zmm24,DWORD PTR [rsi+0x20] + vpbroadcastd zmm25,DWORD PTR [rsi+0x24] + vpbroadcastd zmm26,DWORD PTR [rsi+0x28] + vpbroadcastd zmm27,DWORD PTR [rsi+0x2c] + vpbroadcastd zmm28,DWORD PTR [rsi+0x30] + vpbroadcastd zmm29,DWORD PTR [rsi+0x34] + vpbroadcastd zmm30,DWORD PTR [rsi+0x38] + vpbroadcastd zmm31,DWORD PTR [rsi+0x3c] + vpbroadcastd zmm0,DWORD PTR [rdi] + vpbroadcastd zmm1,DWORD PTR [rdi+0x4] + vpbroadcastd zmm2,DWORD PTR [rdi+0x8] + vpbroadcastd zmm3,DWORD PTR [rdi+0xc] + vpbroadcastd zmm4,DWORD PTR [rdi+0x10] + vpbroadcastd zmm5,DWORD PTR [rdi+0x14] + vpbroadcastd zmm6,DWORD PTR [rdi+0x18] + vpbroadcastd zmm7,DWORD PTR [rdi+0x1c] + vpbroadcastd zmm8,DWORD PTR [BLAKE3_IV_0+rip] + vpbroadcastd zmm9,DWORD PTR [BLAKE3_IV_1+rip] + vpbroadcastd zmm10,DWORD PTR [BLAKE3_IV_2+rip] + vpbroadcastd zmm11,DWORD PTR [BLAKE3_IV_3+rip] + vmovdqa32 zmm12,ZMMWORD PTR [rsp] + vmovdqa32 zmm13,ZMMWORD PTR [rsp+0x40] + vpbroadcastd zmm14,edx + vpbroadcastd zmm15,r8d + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm20 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm17 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm21 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm24 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm28 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm25 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm29 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm18 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm23 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm22 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm16 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm17 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm25 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm27 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm30 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm19 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm29 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm20 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm22 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm27 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm21 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm31 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm26 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm30 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm23 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm19 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm20 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm21 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm24 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm28 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm31 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm29 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm26 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm23 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm16 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm18 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm17 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm25 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm24 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm30 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm28 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm29 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm19 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm22 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm27 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm17 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm31 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm25 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm30 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm19 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm26 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm20 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpxord zmm0,zmm0,zmm8 + vpxord zmm1,zmm1,zmm9 + vpxord zmm2,zmm2,zmm10 + vpxord zmm3,zmm3,zmm11 + vpxord zmm4,zmm4,zmm12 + vpxord zmm5,zmm5,zmm13 + vpxord zmm6,zmm6,zmm14 + vpxord zmm7,zmm7,zmm15 + vpxord zmm8,zmm8,DWORD PTR [rdi]{1to16} + vpxord zmm9,zmm9,DWORD PTR [rdi+0x4]{1to16} + vpxord zmm10,zmm10,DWORD PTR [rdi+0x8]{1to16} + vpxord zmm11,zmm11,DWORD PTR [rdi+0xc]{1to16} + vpxord zmm12,zmm12,DWORD PTR [rdi+0x10]{1to16} + vpxord zmm13,zmm13,DWORD PTR [rdi+0x14]{1to16} + vpxord zmm14,zmm14,DWORD PTR [rdi+0x18]{1to16} + vpxord zmm15,zmm15,DWORD PTR [rdi+0x1c]{1to16} + vpunpckldq zmm16,zmm0,zmm1 + vpunpckhdq zmm17,zmm0,zmm1 + vpunpckldq zmm18,zmm2,zmm3 + vpunpckhdq zmm19,zmm2,zmm3 + vpunpckldq zmm20,zmm4,zmm5 + vpunpckhdq zmm21,zmm4,zmm5 + vpunpckldq zmm22,zmm6,zmm7 + vpunpckhdq zmm23,zmm6,zmm7 + vpunpckldq zmm24,zmm8,zmm9 + vpunpckhdq zmm25,zmm8,zmm9 + vpunpckldq zmm26,zmm10,zmm11 + vpunpckhdq zmm27,zmm10,zmm11 + vpunpckldq zmm28,zmm12,zmm13 + vpunpckhdq zmm29,zmm12,zmm13 + vpunpckldq zmm30,zmm14,zmm15 + vpunpckhdq zmm31,zmm14,zmm15 + vpunpcklqdq zmm0,zmm16,zmm18 + vpunpckhqdq zmm1,zmm16,zmm18 + vpunpcklqdq zmm2,zmm17,zmm19 + vpunpckhqdq zmm3,zmm17,zmm19 + vpunpcklqdq zmm4,zmm20,zmm22 + vpunpckhqdq zmm5,zmm20,zmm22 + vpunpcklqdq zmm6,zmm21,zmm23 + vpunpckhqdq zmm7,zmm21,zmm23 + vpunpcklqdq zmm8,zmm24,zmm26 + vpunpckhqdq zmm9,zmm24,zmm26 + vpunpcklqdq zmm10,zmm25,zmm27 + vpunpckhqdq zmm11,zmm25,zmm27 + vpunpcklqdq zmm12,zmm28,zmm30 + vpunpckhqdq zmm13,zmm28,zmm30 + vpunpcklqdq zmm14,zmm29,zmm31 + vpunpckhqdq zmm15,zmm29,zmm31 + vshufi32x4 zmm16,zmm0,zmm4,0x88 + vshufi32x4 zmm17,zmm1,zmm5,0x88 + vshufi32x4 zmm18,zmm2,zmm6,0x88 + vshufi32x4 zmm19,zmm3,zmm7,0x88 + vshufi32x4 zmm20,zmm0,zmm4,0xdd + vshufi32x4 zmm21,zmm1,zmm5,0xdd + vshufi32x4 zmm22,zmm2,zmm6,0xdd + vshufi32x4 zmm23,zmm3,zmm7,0xdd + vshufi32x4 zmm24,zmm8,zmm12,0x88 + vshufi32x4 zmm25,zmm9,zmm13,0x88 + vshufi32x4 zmm26,zmm10,zmm14,0x88 + vshufi32x4 zmm27,zmm11,zmm15,0x88 + vshufi32x4 zmm28,zmm8,zmm12,0xdd + vshufi32x4 zmm29,zmm9,zmm13,0xdd + vshufi32x4 zmm30,zmm10,zmm14,0xdd + vshufi32x4 zmm31,zmm11,zmm15,0xdd + vshufi32x4 zmm0,zmm16,zmm24,0x88 + vshufi32x4 zmm1,zmm17,zmm25,0x88 + vshufi32x4 zmm2,zmm18,zmm26,0x88 + vshufi32x4 zmm3,zmm19,zmm27,0x88 + vshufi32x4 zmm4,zmm20,zmm28,0x88 + vshufi32x4 zmm5,zmm21,zmm29,0x88 + vshufi32x4 zmm6,zmm22,zmm30,0x88 + vshufi32x4 zmm7,zmm23,zmm31,0x88 + vshufi32x4 zmm8,zmm16,zmm24,0xdd + vshufi32x4 zmm9,zmm17,zmm25,0xdd + vshufi32x4 zmm10,zmm18,zmm26,0xdd + vshufi32x4 zmm11,zmm19,zmm27,0xdd + vshufi32x4 zmm12,zmm20,zmm28,0xdd + vshufi32x4 zmm13,zmm21,zmm29,0xdd + vshufi32x4 zmm14,zmm22,zmm30,0xdd + vshufi32x4 zmm15,zmm23,zmm31,0xdd + vmovdqu32 ZMMWORD PTR [r9],zmm0 + vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1 + vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2 + vmovdqu32 ZMMWORD PTR [r9+0xc0],zmm3 + vmovdqu32 ZMMWORD PTR [r9+0x100],zmm4 + vmovdqu32 ZMMWORD PTR [r9+0x140],zmm5 + vmovdqu32 ZMMWORD PTR [r9+0x180],zmm6 + vmovdqu32 ZMMWORD PTR [r9+0x1c0],zmm7 + vmovdqu32 ZMMWORD PTR [r9+0x200],zmm8 + vmovdqu32 ZMMWORD PTR [r9+0x240],zmm9 + vmovdqu32 ZMMWORD PTR [r9+0x280],zmm10 + vmovdqu32 ZMMWORD PTR [r9+0x2c0],zmm11 + vmovdqu32 ZMMWORD PTR [r9+0x300],zmm12 + vmovdqu32 ZMMWORD PTR [r9+0x340],zmm13 + vmovdqu32 ZMMWORD PTR [r9+0x380],zmm14 + vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15 + vmovdqa32 zmm0,ZMMWORD PTR [rsp] + vmovdqa32 zmm1,ZMMWORD PTR [rsp+0x40] + vpaddd zmm2,zmm0,DWORD PTR [ADD16+rip]{1to16} + vpcmpltud k1,zmm2,zmm0 + vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} + vmovdqa32 ZMMWORD PTR [rsp],zmm2 + vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 + add r9,0x400 + sub r10,0x10 + cmp r10,0x10 + jae 3b + test r10,r10 + jne 2f +9: + vzeroupper + mov rsp,rbp + pop rbp + ret +2: + test r10,0x8 + je 2f + vpbroadcastd ymm16,DWORD PTR [rsi] + vpbroadcastd ymm17,DWORD PTR [rsi+0x4] + vpbroadcastd ymm18,DWORD PTR [rsi+0x8] + vpbroadcastd ymm19,DWORD PTR [rsi+0xc] + vpbroadcastd ymm20,DWORD PTR [rsi+0x10] + vpbroadcastd ymm21,DWORD PTR [rsi+0x14] + vpbroadcastd ymm22,DWORD PTR [rsi+0x18] + vpbroadcastd ymm23,DWORD PTR [rsi+0x1c] + vpbroadcastd ymm24,DWORD PTR [rsi+0x20] + vpbroadcastd ymm25,DWORD PTR [rsi+0x24] + vpbroadcastd ymm26,DWORD PTR [rsi+0x28] + vpbroadcastd ymm27,DWORD PTR [rsi+0x2c] + vpbroadcastd ymm28,DWORD PTR [rsi+0x30] + vpbroadcastd ymm29,DWORD PTR [rsi+0x34] + vpbroadcastd ymm30,DWORD PTR [rsi+0x38] + vpbroadcastd ymm31,DWORD PTR [rsi+0x3c] + vpbroadcastd ymm0,DWORD PTR [rdi] + vpbroadcastd ymm1,DWORD PTR [rdi+0x4] + vpbroadcastd ymm2,DWORD PTR [rdi+0x8] + vpbroadcastd ymm3,DWORD PTR [rdi+0xc] + vpbroadcastd ymm4,DWORD PTR [rdi+0x10] + vpbroadcastd ymm5,DWORD PTR [rdi+0x14] + vpbroadcastd ymm6,DWORD PTR [rdi+0x18] + vpbroadcastd ymm7,DWORD PTR [rdi+0x1c] + vpbroadcastd ymm8,DWORD PTR [BLAKE3_IV_0+rip] + vpbroadcastd ymm9,DWORD PTR [BLAKE3_IV_1+rip] + vpbroadcastd ymm10,DWORD PTR [BLAKE3_IV_2+rip] + vpbroadcastd ymm11,DWORD PTR [BLAKE3_IV_3+rip] + vmovdqa ymm12,YMMWORD PTR [rsp] + vmovdqa ymm13,YMMWORD PTR [rsp+0x40] + vpbroadcastd ymm14,edx + vpbroadcastd ymm15,r8d + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm20 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm17 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm21 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm24 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm28 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm25 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm29 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm18 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm23 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm22 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm16 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm17 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm25 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm27 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm30 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm19 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm29 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm20 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm22 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm27 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm21 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm31 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm26 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm30 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm23 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm19 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm20 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm21 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm24 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm28 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm31 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm29 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm26 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm23 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm16 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm18 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm17 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm25 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm24 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm30 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm28 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm29 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm19 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm22 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm27 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm17 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm31 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm25 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm30 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm19 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm26 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm20 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpxor ymm0,ymm0,ymm8 + vpxor ymm1,ymm1,ymm9 + vpxor ymm2,ymm2,ymm10 + vpxor ymm3,ymm3,ymm11 + vpxor ymm4,ymm4,ymm12 + vpxor ymm5,ymm5,ymm13 + vpxor ymm6,ymm6,ymm14 + vpxor ymm7,ymm7,ymm15 + vpxord ymm8,ymm8,DWORD PTR [rdi]{1to8} + vpxord ymm9,ymm9,DWORD PTR [rdi+0x4]{1to8} + vpxord ymm10,ymm10,DWORD PTR [rdi+0x8]{1to8} + vpxord ymm11,ymm11,DWORD PTR [rdi+0xc]{1to8} + vpxord ymm12,ymm12,DWORD PTR [rdi+0x10]{1to8} + vpxord ymm13,ymm13,DWORD PTR [rdi+0x14]{1to8} + vpxord ymm14,ymm14,DWORD PTR [rdi+0x18]{1to8} + vpxord ymm15,ymm15,DWORD PTR [rdi+0x1c]{1to8} + vpunpckldq ymm16,ymm0,ymm1 + vpunpckhdq ymm17,ymm0,ymm1 + vpunpckldq ymm18,ymm2,ymm3 + vpunpckhdq ymm19,ymm2,ymm3 + vpunpckldq ymm20,ymm4,ymm5 + vpunpckhdq ymm21,ymm4,ymm5 + vpunpckldq ymm22,ymm6,ymm7 + vpunpckhdq ymm23,ymm6,ymm7 + vpunpckldq ymm24,ymm8,ymm9 + vpunpckhdq ymm25,ymm8,ymm9 + vpunpckldq ymm26,ymm10,ymm11 + vpunpckhdq ymm27,ymm10,ymm11 + vpunpckldq ymm28,ymm12,ymm13 + vpunpckhdq ymm29,ymm12,ymm13 + vpunpckldq ymm30,ymm14,ymm15 + vpunpckhdq ymm31,ymm14,ymm15 + vpunpcklqdq ymm0,ymm16,ymm18 + vpunpckhqdq ymm1,ymm16,ymm18 + vpunpcklqdq ymm2,ymm17,ymm19 + vpunpckhqdq ymm3,ymm17,ymm19 + vpunpcklqdq ymm4,ymm20,ymm22 + vpunpckhqdq ymm5,ymm20,ymm22 + vpunpcklqdq ymm6,ymm21,ymm23 + vpunpckhqdq ymm7,ymm21,ymm23 + vpunpcklqdq ymm8,ymm24,ymm26 + vpunpckhqdq ymm9,ymm24,ymm26 + vpunpcklqdq ymm10,ymm25,ymm27 + vpunpckhqdq ymm11,ymm25,ymm27 + vpunpcklqdq ymm12,ymm28,ymm30 + vpunpckhqdq ymm13,ymm28,ymm30 + vpunpcklqdq ymm14,ymm29,ymm31 + vpunpckhqdq ymm15,ymm29,ymm31 + vshufi32x4 ymm16,ymm0,ymm4,0x0 + vshufi32x4 ymm17,ymm8,ymm12,0x0 + vshufi32x4 ymm18,ymm1,ymm5,0x0 + vshufi32x4 ymm19,ymm9,ymm13,0x0 + vshufi32x4 ymm20,ymm2,ymm6,0x0 + vshufi32x4 ymm21,ymm10,ymm14,0x0 + vshufi32x4 ymm22,ymm3,ymm7,0x0 + vshufi32x4 ymm23,ymm11,ymm15,0x0 + vshufi32x4 ymm24,ymm0,ymm4,0x3 + vshufi32x4 ymm25,ymm8,ymm12,0x3 + vshufi32x4 ymm26,ymm1,ymm5,0x3 + vshufi32x4 ymm27,ymm9,ymm13,0x3 + vshufi32x4 ymm28,ymm2,ymm6,0x3 + vshufi32x4 ymm29,ymm10,ymm14,0x3 + vshufi32x4 ymm30,ymm3,ymm7,0x3 + vshufi32x4 ymm31,ymm11,ymm15,0x3 + vmovdqu32 YMMWORD PTR [r9],ymm16 + vmovdqu32 YMMWORD PTR [r9+0x20],ymm17 + vmovdqu32 YMMWORD PTR [r9+0x40],ymm18 + vmovdqu32 YMMWORD PTR [r9+0x60],ymm19 + vmovdqu32 YMMWORD PTR [r9+0x80],ymm20 + vmovdqu32 YMMWORD PTR [r9+0xa0],ymm21 + vmovdqu32 YMMWORD PTR [r9+0xc0],ymm22 + vmovdqu32 YMMWORD PTR [r9+0xe0],ymm23 + vmovdqu32 YMMWORD PTR [r9+0x100],ymm24 + vmovdqu32 YMMWORD PTR [r9+0x120],ymm25 + vmovdqu32 YMMWORD PTR [r9+0x140],ymm26 + vmovdqu32 YMMWORD PTR [r9+0x160],ymm27 + vmovdqu32 YMMWORD PTR [r9+0x180],ymm28 + vmovdqu32 YMMWORD PTR [r9+0x1a0],ymm29 + vmovdqu32 YMMWORD PTR [r9+0x1c0],ymm30 + vmovdqu32 YMMWORD PTR [r9+0x1e0],ymm31 + vmovdqa ymm0,YMMWORD PTR [rsp+0x20] + vmovdqa ymm1,YMMWORD PTR [rsp+0x60] + vmovdqa YMMWORD PTR [rsp],ymm0 + vmovdqa YMMWORD PTR [rsp+0x40],ymm1 + add r9,0x200 + sub r10,0x8 +2: + test r10,0x4 + je 2f + vbroadcasti32x4 zmm0,XMMWORD PTR [rdi] + vbroadcasti32x4 zmm1,XMMWORD PTR [rdi+0x10] + vbroadcasti32x4 zmm2,XMMWORD PTR [BLAKE3_IV+rip] + vmovdqa xmm12,XMMWORD PTR [rsp] + vmovdqa xmm13,XMMWORD PTR [rsp+0x40] + vpunpckldq xmm14,xmm12,xmm13 + vpunpckhdq xmm15,xmm12,xmm13 + vpermq ymm14,ymm14,0xdc + vpermq ymm15,ymm15,0xdc + vpbroadcastd zmm12,edx + vinserti64x4 zmm13,zmm14,ymm15,0x1 + mov eax,0x4444 + kmovw k2,eax + vpblendmd zmm13{k2},zmm13,zmm12 + vpbroadcastd zmm15,r8d + mov eax,0x8888 + kmovw k4,eax + vpblendmd zmm3{k4},zmm13,zmm15 + mov eax,0xaaaa + kmovw k3,eax + vbroadcasti32x4 zmm8,XMMWORD PTR [rsi] + vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x10] + vshufps zmm4,zmm8,zmm9,0x88 + vshufps zmm5,zmm8,zmm9,0xdd + vbroadcasti32x4 zmm8,XMMWORD PTR [rsi+0x20] + vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x30] + vshufps zmm6,zmm8,zmm9,0x88 + vshufps zmm7,zmm8,zmm9,0xdd + vpshufd zmm6,zmm6,0x93 + vpshufd zmm7,zmm7,0x93 + mov al,0x7 +3: + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x10 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0xc + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0x7 + vpshufd zmm0,zmm0,0x93 + vpshufd zmm3,zmm3,0x4e + vpshufd zmm2,zmm2,0x39 + vpaddd zmm0,zmm0,zmm6 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x10 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0xc + vpaddd zmm0,zmm0,zmm7 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0x7 + vpshufd zmm0,zmm0,0x39 + vpshufd zmm3,zmm3,0x4e + vpshufd zmm2,zmm2,0x93 + dec al + je 3f + vshufps zmm8,zmm4,zmm5,0xd6 + vpshufd zmm9,zmm4,0xf + vpshufd zmm4,zmm8,0x39 + vshufps zmm8,zmm6,zmm7,0xfa + vpblendmd zmm9{k3},zmm9,zmm8 + vpunpcklqdq zmm8,zmm7,zmm5 + vpblendmd zmm8{k4},zmm8,zmm6 + vpshufd zmm8,zmm8,0x78 + vpunpckhdq zmm5,zmm5,zmm7 + vpunpckldq zmm6,zmm6,zmm5 + vpshufd zmm7,zmm6,0x1e + vmovdqa32 zmm5,zmm9 + vmovdqa32 zmm6,zmm8 + jmp 3b +3: + vpxord zmm0,zmm0,zmm2 + vpxord zmm1,zmm1,zmm3 + vbroadcasti32x4 zmm8,XMMWORD PTR [rdi] + vbroadcasti32x4 zmm9,XMMWORD PTR [rdi+0x10] + vpxord zmm2,zmm2,zmm8 + vpxord zmm3,zmm3,zmm9 + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 + vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 + vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 + vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 + vextracti32x4 XMMWORD PTR [r9+0x80],zmm0,0x2 + vextracti32x4 XMMWORD PTR [r9+0x90],zmm1,0x2 + vextracti32x4 XMMWORD PTR [r9+0xa0],zmm2,0x2 + vextracti32x4 XMMWORD PTR [r9+0xb0],zmm3,0x2 + vextracti32x4 XMMWORD PTR [r9+0xc0],zmm0,0x3 + vextracti32x4 XMMWORD PTR [r9+0xd0],zmm1,0x3 + vextracti32x4 XMMWORD PTR [r9+0xe0],zmm2,0x3 + vextracti32x4 XMMWORD PTR [r9+0xf0],zmm3,0x3 + vmovdqa xmm0,XMMWORD PTR [rsp+0x10] + vmovdqa xmm1,XMMWORD PTR [rsp+0x50] + vmovdqa XMMWORD PTR [rsp],xmm0 + vmovdqa XMMWORD PTR [rsp+0x40],xmm1 + add r9,0x100 + sub r10,0x4 +2: + test r10,0x2 + je 2f + vbroadcasti128 ymm0,XMMWORD PTR [rdi] + vbroadcasti128 ymm1,XMMWORD PTR [rdi+0x10] + vmovd xmm13,DWORD PTR [rsp] + vpinsrd xmm13,xmm13,DWORD PTR [rsp+0x40],0x1 + vpinsrd xmm13,xmm13,edx,0x2 + vmovd xmm14,DWORD PTR [rsp+0x4] + vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x44],0x1 + vpinsrd xmm14,xmm14,edx,0x2 + vinserti128 ymm13,ymm13,xmm14,0x1 + vbroadcasti128 ymm2,XMMWORD PTR [BLAKE3_IV+rip] + vpbroadcastd ymm8,r8d + vpblendd ymm3,ymm13,ymm8,0x88 + vbroadcasti128 ymm8,XMMWORD PTR [rsi] + vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x10] + vshufps ymm4,ymm8,ymm9,0x88 + vshufps ymm5,ymm8,ymm9,0xdd + vbroadcasti128 ymm8,XMMWORD PTR [rsi+0x20] + vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x30] + vshufps ymm6,ymm8,ymm9,0x88 + vshufps ymm7,ymm8,ymm9,0xdd + vpshufd ymm6,ymm6,0x93 + vpshufd ymm7,ymm7,0x93 + mov al,0x7 +3: + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x10 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0xc + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x8 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0x7 + vpshufd ymm0,ymm0,0x93 + vpshufd ymm3,ymm3,0x4e + vpshufd ymm2,ymm2,0x39 + vpaddd ymm0,ymm0,ymm6 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x10 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0xc + vpaddd ymm0,ymm0,ymm7 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x8 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0x7 + vpshufd ymm0,ymm0,0x39 + vpshufd ymm3,ymm3,0x4e + vpshufd ymm2,ymm2,0x93 + dec al + je 3f + vshufps ymm8,ymm4,ymm5,0xd6 + vpshufd ymm9,ymm4,0xf + vpshufd ymm4,ymm8,0x39 + vshufps ymm8,ymm6,ymm7,0xfa + vpblendd ymm9,ymm9,ymm8,0xaa + vpunpcklqdq ymm8,ymm7,ymm5 + vpblendd ymm8,ymm8,ymm6,0x88 + vpshufd ymm8,ymm8,0x78 + vpunpckhdq ymm5,ymm5,ymm7 + vpunpckldq ymm6,ymm6,ymm5 + vpshufd ymm7,ymm6,0x1e + vmovdqa ymm5,ymm9 + vmovdqa ymm6,ymm8 + jmp 3b +3: + vpxor ymm0,ymm0,ymm2 + vpxor ymm1,ymm1,ymm3 + vbroadcasti128 ymm8,XMMWORD PTR [rdi] + vbroadcasti128 ymm9,XMMWORD PTR [rdi+0x10] + vpxor ymm2,ymm2,ymm8 + vpxor ymm3,ymm3,ymm9 + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 + vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 + vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 + vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 + vmovdqu xmm0,XMMWORD PTR [rsp+0x8] + vmovdqu xmm1,XMMWORD PTR [rsp+0x48] + vmovdqa XMMWORD PTR [rsp],xmm0 + vmovdqa XMMWORD PTR [rsp+0x40],xmm1 + add r9,0x80 + sub r10,0x2 +2: + test r10,0x1 + je 9b + vmovdqu xmm0,XMMWORD PTR [rdi] + vmovdqu xmm1,XMMWORD PTR [rdi+0x10] + vmovd xmm14,DWORD PTR [rsp] + vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x40],0x1 + vpinsrd xmm14,xmm14,edx,0x2 + vmovdqa xmm2,XMMWORD PTR [BLAKE3_IV+rip] + vpinsrd xmm3,xmm14,r8d,0x3 + vmovups xmm8,XMMWORD PTR [rsi] + vmovups xmm9,XMMWORD PTR [rsi+0x10] + vshufps xmm4,xmm8,xmm9,0x88 + vshufps xmm5,xmm8,xmm9,0xdd + vmovups xmm8,XMMWORD PTR [rsi+0x20] + vmovups xmm9,XMMWORD PTR [rsi+0x30] + vshufps xmm6,xmm8,xmm9,0x88 + vshufps xmm7,xmm8,xmm9,0xdd + vpshufd xmm6,xmm6,0x93 + vpshufd xmm7,xmm7,0x93 + mov al,0x7 +3: + vpaddd xmm0,xmm0,xmm4 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm5 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x93 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x39 + vpaddd xmm0,xmm0,xmm6 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm7 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x39 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x93 + dec al + je 3f + vshufps xmm8,xmm4,xmm5,0xd6 + vpshufd xmm9,xmm4,0xf + vpshufd xmm4,xmm8,0x39 + vshufps xmm8,xmm6,xmm7,0xfa + vpblendd xmm9,xmm9,xmm8,0xaa + vpunpcklqdq xmm8,xmm7,xmm5 + vpblendd xmm8,xmm8,xmm6,0x88 + vpshufd xmm8,xmm8,0x78 + vpunpckhdq xmm5,xmm5,xmm7 + vpunpckldq xmm6,xmm6,xmm5 + vpshufd xmm7,xmm6,0x1e + vmovdqa xmm5,xmm9 + vmovdqa xmm6,xmm8 + jmp 3b +3: + vpxor xmm0,xmm0,xmm2 + vpxor xmm1,xmm1,xmm3 + vpxor xmm2,xmm2,XMMWORD PTR [rdi] + vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + jmp 9b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/thirdparty/blake3/c/blake3_avx512_x86-64_windows_gnu.S b/thirdparty/blake3/c/blake3_avx512_x86-64_windows_gnu.S new file mode 100644 index 000000000..ba4fc5fa9 --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx512_x86-64_windows_gnu.S @@ -0,0 +1,2615 @@ +.intel_syntax noprefix + +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +.section .text +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x90], xmm6 + vmovdqa xmmword ptr [rsp+0xA0], xmm7 + vmovdqa xmmword ptr [rsp+0xB0], xmm8 + vmovdqa xmmword ptr [rsp+0xC0], xmm9 + vmovdqa xmmword ptr [rsp+0xD0], xmm10 + vmovdqa xmmword ptr [rsp+0xE0], xmm11 + vmovdqa xmmword ptr [rsp+0xF0], xmm12 + vmovdqa xmmword ptr [rsp+0x100], xmm13 + vmovdqa xmmword ptr [rsp+0x110], xmm14 + vmovdqa xmmword ptr [rsp+0x120], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + vmovdqa ymmword ptr [rsp+0x60], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x90], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jne 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x90] + vmovdqa xmm7, xmmword ptr [rsp+0xA0] + vmovdqa xmm8, xmmword ptr [rsp+0xB0] + vmovdqa xmm9, xmmword ptr [rsp+0xC0] + vmovdqa xmm10, xmmword ptr [rsp+0xD0] + vmovdqa xmm11, xmmword ptr [rsp+0xE0] + vmovdqa xmm12, xmmword ptr [rsp+0xF0] + vmovdqa xmm13, xmmword ptr [rsp+0x100] + vmovdqa xmm14, xmmword ptr [rsp+0x110] + vmovdqa xmm15, xmmword ptr [rsp+0x120] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 2b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x40] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x40], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x78] + movzx r12, byte ptr [rbp+0x88] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x40] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+0x10], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr [rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + mov r10, qword ptr [rsp+0x78] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, xmmword ptr [rcx] + vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] + vmovdqu xmmword ptr [r10], xmm0 + vmovdqu xmmword ptr [r10+0x10], xmm1 + vmovdqu xmmword ptr [r10+0x20], xmm2 + vmovdqu xmmword ptr [r10+0x30], xmm3 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr [rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + +.section .rdata +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/thirdparty/blake3/c/blake3_avx512_x86-64_windows_msvc.asm b/thirdparty/blake3/c/blake3_avx512_x86-64_windows_msvc.asm new file mode 100644 index 000000000..b19efbaae --- /dev/null +++ b/thirdparty/blake3/c/blake3_avx512_x86-64_windows_msvc.asm @@ -0,0 +1,2634 @@ +public _blake3_hash_many_avx512 +public blake3_hash_many_avx512 +public blake3_compress_in_place_avx512 +public _blake3_compress_in_place_avx512 +public blake3_compress_xof_avx512 +public _blake3_compress_xof_avx512 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx512 PROC +_blake3_hash_many_avx512 PROC + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+90H], xmm6 + vmovdqa xmmword ptr [rsp+0A0H], xmm7 + vmovdqa xmmword ptr [rsp+0B0H], xmm8 + vmovdqa xmmword ptr [rsp+0C0H], xmm9 + vmovdqa xmmword ptr [rsp+0D0H], xmm10 + vmovdqa xmmword ptr [rsp+0E0H], xmm11 + vmovdqa xmmword ptr [rsp+0F0H], xmm12 + vmovdqa xmmword ptr [rsp+100H], xmm13 + vmovdqa xmmword ptr [rsp+110H], xmm14 + vmovdqa xmmword ptr [rsp+120H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] + vpcmpud k2, ymm2, ymm0, 1 + vpcmpud k3, ymm3, ymm0, 1 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. + vpbroadcastd ymm6, dword ptr [ADD1] + vpaddd ymm4 {k2}, ymm4, ymm6 + vpaddd ymm5 {k3}, ymm5, ymm6 + ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} + ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+20H], ymm3 + vmovdqa ymmword ptr [rsp+40H], ymm4 + vmovdqa ymmword ptr [rsp+60H], ymm5 + shl rdx, 6 + mov qword ptr [rsp+80H], rdx + cmp rsi, 16 + jc final15blocks +outerloop16: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+1H*4H] + vpbroadcastd zmm2, dword ptr [rcx+2H*4H] + vpbroadcastd zmm3, dword ptr [rcx+3H*4H] + vpbroadcastd zmm4, dword ptr [rcx+4H*4H] + vpbroadcastd zmm5, dword ptr [rcx+5H*4H] + vpbroadcastd zmm6, dword ptr [rcx+6H*4H] + vpbroadcastd zmm7, dword ptr [rcx+7H*4H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop16: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0] + vmovdqa32 zmm31, zmmword ptr [INDEX1] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd zmm15, dword ptr [rsp+22H*4H] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop16 + mov rbx, qword ptr [rbp+90H] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 88H + vshufi32x4 zmm17, zmm1, zmm5, 88H + vshufi32x4 zmm18, zmm2, zmm6, 88H + vshufi32x4 zmm19, zmm3, zmm7, 88H + vshufi32x4 zmm20, zmm0, zmm4, 0DDH + vshufi32x4 zmm21, zmm1, zmm5, 0DDH + vshufi32x4 zmm22, zmm2, zmm6, 0DDH + vshufi32x4 zmm23, zmm3, zmm7, 0DDH + vshufi32x4 zmm0, zmm16, zmm17, 88H + vshufi32x4 zmm1, zmm18, zmm19, 88H + vshufi32x4 zmm2, zmm20, zmm21, 88H + vshufi32x4 zmm3, zmm22, zmm23, 88H + vshufi32x4 zmm4, zmm16, zmm17, 0DDH + vshufi32x4 zmm5, zmm18, zmm19, 0DDH + vshufi32x4 zmm6, zmm20, zmm21, 0DDH + vshufi32x4 zmm7, zmm22, zmm23, 0DDH + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 + vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 + vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 + vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 + vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 + vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 + vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] + vmovdqa32 zmm2, zmm0 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. + vpbroadcastd zmm4, dword ptr [ADD16] + vpbroadcastd zmm5, dword ptr [ADD1] + vpaddd zmm2{k1}, zmm0, zmm4 + ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} + vpcmpud k2, zmm2, zmm0, 1 + vpaddd zmm1 {k2}, zmm1, zmm5 + ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+90H], rbx + sub rsi, 16 + cmp rsi, 16 + jnc outerloop16 + test rsi, rsi + jne final15blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+90H] + vmovdqa xmm7, xmmword ptr [rsp+0A0H] + vmovdqa xmm8, xmmword ptr [rsp+0B0H] + vmovdqa xmm9, xmmword ptr [rsp+0C0H] + vmovdqa xmm10, xmmword ptr [rsp+0D0H] + vmovdqa xmm11, xmmword ptr [rsp+0E0H] + vmovdqa xmm12, xmmword ptr [rsp+0F0H] + vmovdqa xmm13, xmmword ptr [rsp+100H] + vmovdqa xmm14, xmmword ptr [rsp+110H] + vmovdqa xmm15, xmmword ptr [rsp+120H] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final15blocks: + test esi, 8H + je final7blocks + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+40H] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd ymm15, dword ptr [rsp+88H] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+40H] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+40H], ymm2 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + add rdi, 64 + sub rsi, 8 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+80H] + movzx r13, byte ptr [rbp+78H] + movzx r12, byte ptr [rbp+88H] + test esi, 4H + je final3blocks + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+40H] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0DCH + vpermq ymm15, ymm15, 0DCH + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] + vinserti64x4 zmm13, zmm14, ymm15, 01H + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+22H*4H] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-30H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-20H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-10H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 93H + vpshufd zmm7, zmm7, 93H + mov al, 7 +roundloop4: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 93H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 39H + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 39H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 93H + dec al + jz endroundloop4 + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0FH + vpshufd zmm4, zmm8, 39H + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 78H + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 1EH + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp roundloop4 +endroundloop4: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H + vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H + vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H + vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test esi, 2H + je final1block + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+4H] + vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+88H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx512 ENDP +blake3_hash_many_avx512 ENDP + +ALIGN 16 +blake3_compress_in_place_avx512 PROC +_blake3_compress_in_place_avx512 PROC + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp @B +@@: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+10H], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+10H] + vmovdqa xmm8, xmmword ptr [rsp+20H] + vmovdqa xmm9, xmmword ptr [rsp+30H] + add rsp, 72 + ret +_blake3_compress_in_place_avx512 ENDP +blake3_compress_in_place_avx512 ENDP + +ALIGN 16 +blake3_compress_xof_avx512 PROC +_blake3_compress_xof_avx512 PROC + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + mov r10, qword ptr [rsp+78H] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp @B +@@: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, xmmword ptr [rcx] + vpxor xmm3, xmm3, xmmword ptr [rcx+10H] + vmovdqu xmmword ptr [r10], xmm0 + vmovdqu xmmword ptr [r10+10H], xmm1 + vmovdqu xmmword ptr [r10+20H], xmm2 + vmovdqu xmmword ptr [r10+30H], xmm3 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+10H] + vmovdqa xmm8, xmmword ptr [rsp+20H] + vmovdqa xmm9, xmmword ptr [rsp+30H] + add rsp, 72 + ret +_blake3_compress_xof_avx512 ENDP +blake3_compress_xof_avx512 ENDP + +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +INDEX0: + dd 0, 1, 2, 3, 16, 17, 18, 19 + dd 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + dd 4, 5, 6, 7, 20, 21, 22, 23 + dd 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + dd 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: + dd 1 +ADD16: + dd 16 +BLAKE3_BLOCK_LEN: + dd 64 +ALIGN 64 +BLAKE3_IV: +BLAKE3_IV_0: + dd 06A09E667H +BLAKE3_IV_1: + dd 0BB67AE85H +BLAKE3_IV_2: + dd 03C6EF372H +BLAKE3_IV_3: + dd 0A54FF53AH + +_RDATA ENDS +END diff --git a/thirdparty/blake3/c/blake3_c_rust_bindings/Cargo.toml b/thirdparty/blake3/c/blake3_c_rust_bindings/Cargo.toml new file mode 100644 index 000000000..4c3a25204 --- /dev/null +++ b/thirdparty/blake3/c/blake3_c_rust_bindings/Cargo.toml @@ -0,0 +1,32 @@ +# These are Rust bindings for the C implementation of BLAKE3. As there is a +# native (and faster) Rust implementation of BLAKE3 provided in this same repo, +# these bindings are not expected to be used in production. They're intended +# for testing and benchmarking. + +[package] +name = "blake3_c_rust_bindings" +version = "0.0.0" +description = "TESTING ONLY Rust bindings for the BLAKE3 C implementation" +edition = "2021" + +[features] +# By default the x86-64 build uses assembly implementations. This feature makes +# the build use the C intrinsics implementations instead. +prefer_intrinsics = [] +# Activate NEON bindings. We don't currently do any CPU feature detection for +# this. If this Cargo feature is on, the NEON gets used. +neon = [] +# Enable TBB-based multithreading. +tbb = [] + +[dev-dependencies] +arrayref = "0.3.5" +arrayvec = { version = "0.7.0", default-features = false } +page_size = "0.6.0" +rand = "0.9.0" +rand_chacha = "0.9.0" +reference_impl = { path = "../../reference_impl" } + +[build-dependencies] +cc = "1.0.48" +ignore = "0.4.23" diff --git a/thirdparty/blake3/c/blake3_c_rust_bindings/README.md b/thirdparty/blake3/c/blake3_c_rust_bindings/README.md new file mode 100644 index 000000000..c44726b90 --- /dev/null +++ b/thirdparty/blake3/c/blake3_c_rust_bindings/README.md @@ -0,0 +1,4 @@ +These are Rust bindings for the C implementation of BLAKE3. As there is +a native Rust implementation of BLAKE3 provided in this same repo, these +bindings are not expected to be used in production. They're intended for +testing and benchmarking. diff --git a/thirdparty/blake3/c/blake3_c_rust_bindings/benches/bench.rs b/thirdparty/blake3/c/blake3_c_rust_bindings/benches/bench.rs new file mode 100644 index 000000000..b6c570444 --- /dev/null +++ b/thirdparty/blake3/c/blake3_c_rust_bindings/benches/bench.rs @@ -0,0 +1,477 @@ +#![feature(test)] + +extern crate test; + +use arrayref::array_ref; +use arrayvec::ArrayVec; +use rand::prelude::*; +use test::Bencher; + +const KIB: usize = 1024; +const MAX_SIMD_DEGREE: usize = 16; + +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; +const OUT_LEN: usize = 32; + +// This struct randomizes two things: +// 1. The actual bytes of input. +// 2. The page offset the input starts at. +pub struct RandomInput { + buf: Vec<u8>, + len: usize, + offsets: Vec<usize>, + offset_index: usize, +} + +impl RandomInput { + pub fn new(b: &mut Bencher, len: usize) -> Self { + b.bytes += len as u64; + let page_size: usize = page_size::get(); + let mut buf = vec![0u8; len + page_size]; + let mut rng = rand::rng(); + rng.fill_bytes(&mut buf); + let mut offsets: Vec<usize> = (0..page_size).collect(); + offsets.shuffle(&mut rng); + Self { + buf, + len, + offsets, + offset_index: 0, + } + } + + pub fn get(&mut self) -> &[u8] { + let offset = self.offsets[self.offset_index]; + self.offset_index += 1; + if self.offset_index >= self.offsets.len() { + self.offset_index = 0; + } + &self.buf[offset..][..self.len] + } +} + +type CompressInPlaceFn = + unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); + +fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) { + let mut state = [1u32; 8]; + let mut r = RandomInput::new(b, 64); + let input = array_ref!(r.get(), 0, 64); + b.iter(|| unsafe { f(state.as_mut_ptr(), input.as_ptr(), 64, 0, 0) }); +} + +#[bench] +fn bench_single_compression_portable(b: &mut Bencher) { + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::blake3_compress_in_place_portable, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse2, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse41, + ); +} + +#[bench] +fn bench_single_compression_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_avx512, + ); +} + +type HashManyFn = unsafe extern "C" fn( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, +); + +fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, CHUNK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + unsafe { + f( + input_arrays.as_ptr() as _, + input_arrays.len(), + CHUNK_LEN / BLOCK_LEN, + [0u32; 8].as_ptr(), + 0, + true, + 0, + 0, + 0, + out.as_mut_ptr(), + ) + } + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_avx2(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx2_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, + 8, + ); +} + +#[bench] +fn bench_many_chunks_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, + 16, + ); +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_chunks_neon(b: &mut Bencher) { + // When "neon" is on, NEON support is assumed. + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, + 4, + ); +} + +// TODO: When we get const generics we can unify this with the chunks code. +fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, BLOCK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + unsafe { + f( + input_arrays.as_ptr() as _, + input_arrays.len(), + 1, + [0u32; 8].as_ptr(), + 0, + false, + 0, + 0, + 0, + out.as_mut_ptr(), + ) + } + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_avx2(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx2_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, + 8, + ); +} + +#[bench] +fn bench_many_parents_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, + 16, + ); +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_parents_neon(b: &mut Bencher) { + // When "neon" is on, NEON support is assumed. + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, + 4, + ); +} + +fn bench_incremental(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3_c_rust_bindings::Hasher::new(); + hasher.update(input.get()); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} + +#[bench] +fn bench_incremental_0001_block(b: &mut Bencher) { + bench_incremental(b, BLOCK_LEN); +} + +#[bench] +fn bench_incremental_0001_kib(b: &mut Bencher) { + bench_incremental(b, 1 * KIB); +} + +#[bench] +fn bench_incremental_0002_kib(b: &mut Bencher) { + bench_incremental(b, 2 * KIB); +} + +#[bench] +fn bench_incremental_0004_kib(b: &mut Bencher) { + bench_incremental(b, 4 * KIB); +} + +#[bench] +fn bench_incremental_0008_kib(b: &mut Bencher) { + bench_incremental(b, 8 * KIB); +} + +#[bench] +fn bench_incremental_0016_kib(b: &mut Bencher) { + bench_incremental(b, 16 * KIB); +} + +#[bench] +fn bench_incremental_0032_kib(b: &mut Bencher) { + bench_incremental(b, 32 * KIB); +} + +#[bench] +fn bench_incremental_0064_kib(b: &mut Bencher) { + bench_incremental(b, 64 * KIB); +} + +#[bench] +fn bench_incremental_0128_kib(b: &mut Bencher) { + bench_incremental(b, 128 * KIB); +} + +#[bench] +fn bench_incremental_0256_kib(b: &mut Bencher) { + bench_incremental(b, 256 * KIB); +} + +#[bench] +fn bench_incremental_0512_kib(b: &mut Bencher) { + bench_incremental(b, 512 * KIB); +} + +#[bench] +fn bench_incremental_1024_kib(b: &mut Bencher) { + bench_incremental(b, 1024 * KIB); +} + +#[cfg(feature = "tbb")] +fn bench_tbb(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3_c_rust_bindings::Hasher::new(); + hasher.update_tbb(input.get()); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0001_block(b: &mut Bencher) { + bench_tbb(b, BLOCK_LEN); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0001_kib(b: &mut Bencher) { + bench_tbb(b, 1 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0002_kib(b: &mut Bencher) { + bench_tbb(b, 2 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0004_kib(b: &mut Bencher) { + bench_tbb(b, 4 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0008_kib(b: &mut Bencher) { + bench_tbb(b, 8 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0016_kib(b: &mut Bencher) { + bench_tbb(b, 16 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0032_kib(b: &mut Bencher) { + bench_tbb(b, 32 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0064_kib(b: &mut Bencher) { + bench_tbb(b, 64 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0128_kib(b: &mut Bencher) { + bench_tbb(b, 128 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0256_kib(b: &mut Bencher) { + bench_tbb(b, 256 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_0512_kib(b: &mut Bencher) { + bench_tbb(b, 512 * KIB); +} + +#[bench] +#[cfg(feature = "tbb")] +fn bench_tbb_1024_kib(b: &mut Bencher) { + bench_tbb(b, 1024 * KIB); +} + +// This checks that update() splits up its input in increasing powers of 2, so +// that it can recover a high degree of parallelism when the number of bytes +// hashed so far is uneven. The performance of this benchmark should be +// reasonably close to bench_incremental_0064_kib, within 80% or so. When we +// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), +// performance was less than half. +#[bench] +fn bench_two_updates(b: &mut Bencher) { + let len = 65536; + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3_c_rust_bindings::Hasher::new(); + let input = input.get(); + hasher.update(&input[..1]); + hasher.update(&input[1..]); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} diff --git a/thirdparty/blake3/c/blake3_c_rust_bindings/build.rs b/thirdparty/blake3/c/blake3_c_rust_bindings/build.rs new file mode 100644 index 000000000..0236df4b4 --- /dev/null +++ b/thirdparty/blake3/c/blake3_c_rust_bindings/build.rs @@ -0,0 +1,253 @@ +use std::env; + +fn defined(var: &str) -> bool { + env::var_os(var).is_some() +} + +fn target_components() -> Vec<String> { + let target = env::var("TARGET").unwrap(); + target.split("-").map(|s| s.to_string()).collect() +} + +fn is_x86_64() -> bool { + target_components()[0] == "x86_64" +} + +fn is_windows_target() -> bool { + env::var("CARGO_CFG_TARGET_OS").unwrap() == "windows" +} + +fn use_msvc_asm() -> bool { + const MSVC_NAMES: &[&str] = &["", "cl", "cl.exe"]; + let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default(); + let target_windows_msvc = target_os == "windows" && target_env == "msvc"; + let host_triple = env::var("HOST").unwrap_or_default(); + let target_triple = env::var("TARGET").unwrap_or_default(); + let cross_compiling = host_triple != target_triple; + let cc = env::var("CC").unwrap_or_default().to_ascii_lowercase(); + if !target_windows_msvc { + // We are not building for Windows with the MSVC toolchain. + false + } else if !cross_compiling && MSVC_NAMES.contains(&&*cc) { + // We are building on Windows with the MSVC toolchain (and not cross-compiling for another architecture or target). + true + } else { + // We are cross-compiling to Windows with the MSVC toolchain. + let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); + let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); + let cc = env::var(format!("CC_{target_arch}_{target_vendor}_windows_msvc")) + .unwrap_or_default() + .to_ascii_lowercase(); + // Check if we are using the MSVC compiler. + MSVC_NAMES.contains(&&*cc) + } +} + +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + +fn is_armv7() -> bool { + target_components()[0] == "armv7" +} + +fn is_aarch64() -> bool { + target_components()[0] == "aarch64" +} + +// Windows targets may be using the MSVC toolchain or the GNU toolchain. The +// right compiler flags to use depend on the toolchain. (And we don't want to +// use flag_if_supported, because we don't want features to be silently +// disabled by old compilers.) +fn is_windows_msvc() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "msvc" +} + +fn new_build() -> cc::Build { + let mut build = cc::Build::new(); + if !is_windows_msvc() { + build.flag("-std=c11"); + } + build +} + +fn new_cpp_build() -> cc::Build { + let mut build = cc::Build::new(); + build.cpp(true); + if is_windows_msvc() { + build.flag("/std:c++20"); + build.flag("/EHs-c-"); + build.flag("/GR-"); + } else { + build.flag("-std=c++20"); + build.flag("-fno-exceptions"); + build.flag("-fno-rtti"); + } + build +} + +fn c_dir_path(filename: &str) -> String { + // The `cross` tool doesn't support reading files in parent directories. As a hacky workaround + // in `cross_test.sh`, we move the c/ directory around and set BLAKE3_C_DIR_OVERRIDE. Regular + // building and testing doesn't require this. + if let Ok(c_dir_override) = env::var("BLAKE3_C_DIR_OVERRIDE") { + c_dir_override + "/" + filename + } else { + "../".to_string() + filename + } +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut base_build = new_build(); + base_build.file(c_dir_path("blake3.c")); + base_build.file(c_dir_path("blake3_dispatch.c")); + base_build.file(c_dir_path("blake3_portable.c")); + if cfg!(feature = "tbb") { + base_build.define("BLAKE3_USE_TBB", "1"); + } + base_build.compile("blake3_base"); + + if cfg!(feature = "tbb") { + let mut tbb_build = new_cpp_build(); + tbb_build.define("BLAKE3_USE_TBB", "1"); + tbb_build.file(c_dir_path("blake3_tbb.cpp")); + tbb_build.compile("blake3_tbb"); + println!("cargo::rustc-link-lib=tbb"); + } + + if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") { + // On 64-bit, use the assembly implementations, unless the + // "prefer_intrinsics" feature is enabled. + if is_windows_target() { + if use_msvc_asm() { + let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_sse41_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_avx2_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_avx512_x86-64_windows_msvc.asm")); + build.compile("blake3_asm"); + } else { + let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_sse41_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_avx2_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_avx512_x86-64_windows_gnu.S")); + build.compile("blake3_asm"); + } + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. + let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_unix.S")); + build.file(c_dir_path("blake3_sse41_x86-64_unix.S")); + build.file(c_dir_path("blake3_avx2_x86-64_unix.S")); + build.file(c_dir_path("blake3_avx512_x86-64_unix.S")); + build.compile("blake3_asm"); + } + } else if is_x86_64() || is_x86_32() { + // Assembly implementations are only for 64-bit. On 32-bit, or if + // the "prefer_intrinsics" feature is enabled, use the + // intrinsics-based C implementations. These each need to be + // compiled separately, with the corresponding instruction set + // extension explicitly enabled in the compiler. + + let mut sse2_build = new_build(); + sse2_build.file(c_dir_path("blake3_sse2.c")); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrinsics: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse2_build.flag("-msse2"); + } + sse2_build.compile("blake3_sse2"); + + let mut sse41_build = new_build(); + sse41_build.file(c_dir_path("blake3_sse41.c")); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrinsics: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse41_build.flag("-msse4.1"); + } + sse41_build.compile("blake3_sse41"); + + let mut avx2_build = new_build(); + avx2_build.file(c_dir_path("blake3_avx2.c")); + if is_windows_msvc() { + avx2_build.flag("/arch:AVX2"); + } else { + avx2_build.flag("-mavx2"); + } + avx2_build.compile("blake3_avx2"); + + let mut avx512_build = new_build(); + avx512_build.file(c_dir_path("blake3_avx512.c")); + if is_windows_msvc() { + // Note that a lot of versions of MSVC don't support /arch:AVX512, + // and they'll discard it with a warning, hopefully leading to a + // build error. + avx512_build.flag("/arch:AVX512"); + } else { + avx512_build.flag("-mavx512f"); + avx512_build.flag("-mavx512vl"); + } + avx512_build.compile("blake3_avx512"); + } + + // We only build NEON code here if + // 1) it's requested + // and 2) the root crate is not already building it. + // The only time this will really happen is if you build this + // crate by hand with the "neon" feature for some reason. + // + // In addition, 3) if the target is aarch64, NEON is on by default. + if defined("CARGO_FEATURE_NEON") || is_aarch64() { + let mut neon_build = new_build(); + neon_build.file(c_dir_path("blake3_neon.c")); + // ARMv7 platforms that support NEON generally need the following + // flags. AArch64 supports NEON by default and does not support -mpfu. + if is_armv7() { + neon_build.flag("-mfpu=neon-vfpv4"); + neon_build.flag("-mfloat-abi=hard"); + } + neon_build.compile("blake3_neon"); + } + + // The `cc` crate does not automatically emit rerun-if directives for the + // environment variables it supports, in particular for $CC. We expect to + // do a lot of benchmarking across different compilers, so we explicitly + // add the variables that we're likely to need. + println!("cargo:rerun-if-env-changed=CC"); + println!("cargo:rerun-if-env-changed=CFLAGS"); + + // Ditto for source files, though these shouldn't change as often. `ignore::Walk` respects + // .gitignore, so this doesn't traverse target/. + for result in ignore::Walk::new("..") { + let result = result?; + let path = result.path(); + if path.is_file() { + println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); + } + } + + // When compiling with clang-cl for windows, it adds .asm files to the root + // which we need to delete so cargo doesn't get angry + if is_windows_target() && !use_msvc_asm() { + let _ = std::fs::remove_file("blake3_avx2_x86-64_windows_gnu.asm"); + let _ = std::fs::remove_file("blake3_avx512_x86-64_windows_gnu.asm"); + let _ = std::fs::remove_file("blake3_sse2_x86-64_windows_gnu.asm"); + let _ = std::fs::remove_file("blake3_sse41_x86-64_windows_gnu.asm"); + } + + Ok(()) +} diff --git a/thirdparty/blake3/c/blake3_c_rust_bindings/cross_test.sh b/thirdparty/blake3/c/blake3_c_rust_bindings/cross_test.sh new file mode 100755 index 000000000..94d50affb --- /dev/null +++ b/thirdparty/blake3/c/blake3_c_rust_bindings/cross_test.sh @@ -0,0 +1,31 @@ +#! /usr/bin/env bash + +# This hacky script works around the fact that `cross test` does not support +# path dependencies. (It uses a docker shared folder to let the guest access +# project files, so parent directories aren't available.) Solve this problem by +# copying the entire project to a temp dir and rearranging paths to put "c" and +# "reference_impl" underneath "blake3_c_rust_bindings", so that everything is +# accessible. Hopefully this will just run on CI forever and no one will ever +# read this and discover my deep shame. + +set -e -u -o pipefail + +project_root="$(realpath "$(dirname "$BASH_SOURCE")/../..")" +tmpdir="$(mktemp -d)" +echo "Running cross tests in $tmpdir" +cd "$tmpdir" +git clone "$project_root" blake3 +mv blake3/c/blake3_c_rust_bindings . +mv blake3/reference_impl blake3_c_rust_bindings +mv blake3/c blake3_c_rust_bindings +cd blake3_c_rust_bindings +sed -i 's|reference_impl = { path = "../../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml + +export BLAKE3_C_DIR_OVERRIDE="./c" +cat > Cross.toml << EOF +[build.env] +passthrough = [ + "BLAKE3_C_DIR_OVERRIDE", +] +EOF +cross test "$@" diff --git a/thirdparty/blake3/c/blake3_c_rust_bindings/src/lib.rs b/thirdparty/blake3/c/blake3_c_rust_bindings/src/lib.rs new file mode 100644 index 000000000..c2b398920 --- /dev/null +++ b/thirdparty/blake3/c/blake3_c_rust_bindings/src/lib.rs @@ -0,0 +1,333 @@ +//! These are Rust bindings for the C implementation of BLAKE3. As there is a +//! native (and faster) Rust implementation of BLAKE3 provided in this same +//! repo, these bindings are not expected to be used in production. They're +//! intended for testing and benchmarking. + +use std::ffi::{c_void, CString}; +use std::mem::MaybeUninit; + +#[cfg(test)] +mod test; + +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; +pub const OUT_LEN: usize = 32; + +// Feature detection functions for tests and benchmarks. Note that the C code +// does its own feature detection in blake3_dispatch.c. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn sse2_detected() -> bool { + is_x86_feature_detected!("sse2") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn sse41_detected() -> bool { + is_x86_feature_detected!("sse4.1") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx2_detected() -> bool { + is_x86_feature_detected!("avx2") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx512_detected() -> bool { + is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") +} + +#[derive(Clone)] +pub struct Hasher(ffi::blake3_hasher); + +impl Hasher { + pub fn new() -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init(c_state.as_mut_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_keyed(key: &[u8; 32]) -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init_keyed(c_state.as_mut_ptr(), key.as_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_derive_key(context: &str) -> Self { + let mut c_state = MaybeUninit::uninit(); + let context_c_string = CString::new(context).expect("valid C string, no null bytes"); + unsafe { + ffi::blake3_hasher_init_derive_key(c_state.as_mut_ptr(), context_c_string.as_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_derive_key_raw(context: &[u8]) -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init_derive_key_raw( + c_state.as_mut_ptr(), + context.as_ptr() as *const _, + context.len(), + ); + Self(c_state.assume_init()) + } + } + + pub fn update(&mut self, input: &[u8]) { + unsafe { + ffi::blake3_hasher_update(&mut self.0, input.as_ptr() as *const c_void, input.len()); + } + } + + #[cfg(feature = "tbb")] + pub fn update_tbb(&mut self, input: &[u8]) { + unsafe { + ffi::blake3_hasher_update_tbb( + &mut self.0, + input.as_ptr() as *const c_void, + input.len(), + ); + } + } + + pub fn finalize(&self, output: &mut [u8]) { + unsafe { + ffi::blake3_hasher_finalize(&self.0, output.as_mut_ptr(), output.len()); + } + } + + pub fn finalize_seek(&self, seek: u64, output: &mut [u8]) { + unsafe { + ffi::blake3_hasher_finalize_seek(&self.0, seek, output.as_mut_ptr(), output.len()); + } + } + + pub fn reset(&mut self) { + unsafe { + ffi::blake3_hasher_reset(&mut self.0); + } + } +} + +pub mod ffi { + #[repr(C)] + #[derive(Copy, Clone)] + pub struct blake3_chunk_state { + pub cv: [u32; 8usize], + pub chunk_counter: u64, + pub buf: [u8; 64usize], + pub buf_len: u8, + pub blocks_compressed: u8, + pub flags: u8, + } + + #[repr(C)] + #[derive(Copy, Clone)] + pub struct blake3_hasher { + pub key: [u32; 8usize], + pub chunk: blake3_chunk_state, + pub cv_stack_len: u8, + pub cv_stack: [u8; 1728usize], + } + + extern "C" { + // public interface + pub fn blake3_hasher_init(self_: *mut blake3_hasher); + pub fn blake3_hasher_init_keyed(self_: *mut blake3_hasher, key: *const u8); + pub fn blake3_hasher_init_derive_key( + self_: *mut blake3_hasher, + context: *const ::std::os::raw::c_char, + ); + pub fn blake3_hasher_init_derive_key_raw( + self_: *mut blake3_hasher, + context: *const ::std::os::raw::c_void, + context_len: usize, + ); + pub fn blake3_hasher_update( + self_: *mut blake3_hasher, + input: *const ::std::os::raw::c_void, + input_len: usize, + ); + #[cfg(feature = "tbb")] + pub fn blake3_hasher_update_tbb( + self_: *mut blake3_hasher, + input: *const ::std::os::raw::c_void, + input_len: usize, + ); + pub fn blake3_hasher_finalize(self_: *const blake3_hasher, out: *mut u8, out_len: usize); + pub fn blake3_hasher_finalize_seek( + self_: *const blake3_hasher, + seek: u64, + out: *mut u8, + out_len: usize, + ); + pub fn blake3_hasher_reset(self_: *mut blake3_hasher); + + // portable low-level functions + pub fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_portable( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_portable( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub mod x86 { + extern "C" { + // SSE2 low level functions + pub fn blake3_compress_in_place_sse2( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse2( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // SSE4.1 low level functions + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // AVX2 low level functions + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // AVX-512 low level functions + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_hash_many_avx512( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + #[cfg(unix)] + pub fn blake3_xof_many_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + outblocks: usize, + ); + } + } + + #[cfg(feature = "neon")] + pub mod neon { + extern "C" { + // NEON low level functions + pub fn blake3_hash_many_neon( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + } +} diff --git a/thirdparty/blake3/c/blake3_c_rust_bindings/src/test.rs b/thirdparty/blake3/c/blake3_c_rust_bindings/src/test.rs new file mode 100644 index 000000000..6ff61ec6f --- /dev/null +++ b/thirdparty/blake3/c/blake3_c_rust_bindings/src/test.rs @@ -0,0 +1,696 @@ +// Most of this code is duplicated from the root `blake3` crate. Perhaps we +// could share more of it in the future. + +use crate::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use arrayref::{array_mut_ref, array_ref}; +use arrayvec::ArrayVec; +use core::usize; +use rand::prelude::*; + +const CHUNK_START: u8 = 1 << 0; +const CHUNK_END: u8 = 1 << 1; +const PARENT: u8 = 1 << 2; +const ROOT: u8 = 1 << 3; +const KEYED_HASH: u8 = 1 << 4; +// const DERIVE_KEY_CONTEXT: u8 = 1 << 5; +// const DERIVE_KEY_MATERIAL: u8 = 1 << 6; + +// Interesting input lengths to run tests on. +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; + +// There's a test to make sure these two are equal below. +pub const TEST_KEY: [u8; 32] = *b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: [u32; 8] = [ + 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, +]; + +// Paint the input with a repeating byte pattern. We use a cycle length of 251, +// because that's the largest prime number less than 256. This makes it +// unlikely to swapping any two adjacent input blocks or chunks will give the +// same answer. +fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +#[inline(always)] +fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +type CompressInPlaceFn = + unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); + +type CompressXofFn = unsafe extern "C" fn( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, +); + +// A shared helper function for platform-specific tests. +pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { + let initial_state = TEST_KEY_WORDS; + let block_len: u8 = 61; + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len as usize]); + // Use a counter with set bits in both 32-bit words. + let counter = (5u64 << 32) + 6; + let flags = CHUNK_END | ROOT | KEYED_HASH; + + let mut portable_out = [0; 64]; + unsafe { + crate::ffi::blake3_compress_xof_portable( + initial_state.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + portable_out.as_mut_ptr(), + ); + } + + let mut test_state = initial_state; + unsafe { + compress_in_place_fn( + test_state.as_mut_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + ) + }; + let test_state_bytes = le_bytes_from_words_32(&test_state); + let mut test_xof = [0; 64]; + unsafe { + compress_xof_fn( + initial_state.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + test_xof.as_mut_ptr(), + ) + }; + + assert_eq!(&portable_out[..32], &test_state_bytes[..]); + assert_eq!(&portable_out[..], &test_xof[..]); +} + +// Testing the portable implementation against itself is circular, but why not. +#[test] +fn test_compress_portable() { + test_compress_fn( + crate::ffi::blake3_compress_in_place_portable, + crate::ffi::blake3_compress_xof_portable, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_sse2() { + if !crate::sse2_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_sse2, + crate::ffi::x86::blake3_compress_xof_sse2, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_sse41() { + if !crate::sse41_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_sse41, + crate::ffi::x86::blake3_compress_xof_sse41, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_avx512() { + if !crate::avx512_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_avx512, + crate::ffi::x86::blake3_compress_xof_avx512, + ); +} + +type HashManyFn = unsafe extern "C" fn( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, +); + +// A shared helper function for platform-specific tests. +pub fn test_hash_many_fn(hash_many_fn: HashManyFn) { + // Test a few different initial counter values. + // - 0: The base case. + // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... + let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) inputs + const NUM_INPUTS: usize = 31; + let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; + crate::test::paint_test_input(&mut input_buf); + + // First hash chunks. + let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); + } + let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + crate::ffi::blake3_hash_many_portable( + chunks.as_ptr() as _, + chunks.len(), + CHUNK_LEN / BLOCK_LEN, + TEST_KEY_WORDS.as_ptr(), + counter, + true, + KEYED_HASH, + CHUNK_START, + CHUNK_END, + portable_chunks_out.as_mut_ptr(), + ); + } + + let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_fn( + chunks.as_ptr() as _, + chunks.len(), + CHUNK_LEN / BLOCK_LEN, + TEST_KEY_WORDS.as_ptr(), + counter, + true, + KEYED_HASH, + CHUNK_START, + CHUNK_END, + test_chunks_out.as_mut_ptr(), + ); + } + for n in 0..NUM_INPUTS { + dbg!(n); + assert_eq!( + &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], + &test_chunks_out[n * OUT_LEN..][..OUT_LEN] + ); + } + + // Then hash parents. + let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); + } + let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + crate::ffi::blake3_hash_many_portable( + parents.as_ptr() as _, + parents.len(), + 1, + TEST_KEY_WORDS.as_ptr(), + counter, + false, + KEYED_HASH | PARENT, + 0, + 0, + portable_parents_out.as_mut_ptr(), + ); + } + + let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_fn( + parents.as_ptr() as _, + parents.len(), + 1, + TEST_KEY_WORDS.as_ptr(), + counter, + false, + KEYED_HASH | PARENT, + 0, + 0, + test_parents_out.as_mut_ptr(), + ); + } + for n in 0..NUM_INPUTS { + dbg!(n); + assert_eq!( + &portable_parents_out[n * OUT_LEN..][..OUT_LEN], + &test_parents_out[n * OUT_LEN..][..OUT_LEN] + ); + } + } +} + +// Testing the portable implementation against itself is circular, but why not. +#[test] +fn test_hash_many_portable() { + test_hash_many_fn(crate::ffi::blake3_hash_many_portable); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_sse2() { + if !crate::sse2_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse2); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_sse41() { + if !crate::sse41_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse41); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_avx2() { + if !crate::avx2_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx2); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_avx512() { + if !crate::avx512_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx512); +} + +#[test] +#[cfg(feature = "neon")] +fn test_hash_many_neon() { + test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon); +} + +#[allow(unused)] +type XofManyFunction = unsafe extern "C" fn( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + outblocks: usize, +); + +// A shared helper function for platform-specific tests. +#[allow(unused)] +pub fn test_xof_many_fn(xof_many_function: XofManyFunction) { + let mut block = [0; BLOCK_LEN]; + let block_len = 42; + crate::test::paint_test_input(&mut block[..block_len]); + let cv = [40, 41, 42, 43, 44, 45, 46, 47]; + let flags = KEYED_HASH; + + // Test a few different initial counter values. + // - 0: The base case. + // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... + let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) outputs + const OUTPUT_SIZE: usize = 31 * BLOCK_LEN; + + let mut portable_out = [0u8; OUTPUT_SIZE]; + for (i, out_block) in portable_out.chunks_exact_mut(BLOCK_LEN).enumerate() { + unsafe { + crate::ffi::blake3_compress_xof_portable( + cv.as_ptr(), + block.as_ptr(), + block_len as u8, + counter + i as u64, + flags, + out_block.as_mut_ptr(), + ); + } + } + + let mut test_out = [0u8; OUTPUT_SIZE]; + unsafe { + xof_many_function( + cv.as_ptr(), + block.as_ptr(), + block_len as u8, + counter, + flags, + test_out.as_mut_ptr(), + OUTPUT_SIZE / BLOCK_LEN, + ); + } + + assert_eq!(portable_out, test_out); + } + + // Test that xof_many doesn't write more blocks than requested. Note that the current assembly + // implementation always outputs at least one block, so we don't test the zero case. + for block_count in 1..=32 { + let mut array = [0; BLOCK_LEN * 33]; + let output_start = 17; + let output_len = block_count * BLOCK_LEN; + let output_end = output_start + output_len; + let output = &mut array[output_start..output_end]; + unsafe { + xof_many_function( + cv.as_ptr(), + block.as_ptr(), + block_len as u8, + 0, + flags, + output.as_mut_ptr(), + block_count, + ); + } + for i in 0..array.len() { + if i < output_start || output_end <= i { + assert_eq!(0, array[i], "index {i}"); + } + } + } +} + +#[test] +#[cfg(unix)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_xof_many_avx512() { + if !crate::avx512_detected() { + return; + } + test_xof_many_fn(crate::ffi::x86::blake3_xof_many_avx512); +} + +#[test] +fn test_compare_reference_impl() { + const OUT: usize = 303; // more than 64, not a multiple of 4 + let mut input_buf = [0; TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + for &case in TEST_CASES { + let input = &input_buf[..case]; + dbg!(case); + + // regular + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + + #[cfg(feature = "tbb")] + { + let mut tbb_hasher = crate::Hasher::new(); + tbb_hasher.update_tbb(input); + let mut tbb_out = [0; OUT]; + tbb_hasher.finalize(&mut tbb_out); + assert_eq!(tbb_out[..], expected_out[..]); + } + } + + // keyed + { + let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + + #[cfg(feature = "tbb")] + { + let mut tbb_hasher = crate::Hasher::new_keyed(&TEST_KEY); + tbb_hasher.update_tbb(input); + let mut tbb_out = [0; OUT]; + tbb_hasher.finalize(&mut tbb_out); + assert_eq!(tbb_out[..], expected_out[..]); + } + } + + // derive_key + { + let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // the regular C string API + let mut test_hasher = crate::Hasher::new_derive_key(context); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + + // the raw bytes API + let mut test_hasher_raw = crate::Hasher::new_derive_key_raw(context.as_bytes()); + test_hasher_raw.update(input); + let mut test_out_raw = [0; OUT]; + test_hasher_raw.finalize(&mut test_out_raw); + assert_eq!(test_out_raw[..], expected_out[..]); + + #[cfg(feature = "tbb")] + { + let mut tbb_hasher = crate::Hasher::new_derive_key(context); + tbb_hasher.update_tbb(input); + let mut tbb_out = [0; OUT]; + tbb_hasher.finalize(&mut tbb_out); + assert_eq!(tbb_out[..], expected_out[..]); + } + } + } +} + +fn reference_hash(input: &[u8]) -> [u8; OUT_LEN] { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + let mut bytes = [0; OUT_LEN]; + hasher.finalize(&mut bytes); + bytes.into() +} + +#[test] +fn test_compare_update_multiple() { + // Don't use all the long test cases here, since that's unnecessarily slow + // in debug mode. + let mut short_test_cases = TEST_CASES; + while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { + short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; + } + assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); + + let mut input_buf = [0; 2 * TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + + for &first_update in short_test_cases { + dbg!(first_update); + let first_input = &input_buf[..first_update]; + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(first_input); + + for &second_update in short_test_cases { + dbg!(second_update); + let second_input = &input_buf[first_update..][..second_update]; + let total_input = &input_buf[..first_update + second_update]; + + // Clone the hasher with first_update bytes already written, so + // that the next iteration can reuse it. + let mut test_hasher = test_hasher.clone(); + test_hasher.update(second_input); + let mut test_out = [0; OUT_LEN]; + test_hasher.finalize(&mut test_out); + + let expected = reference_hash(total_input); + assert_eq!(expected, test_out); + } + } +} + +#[test] +fn test_fuzz_hasher() { + const INPUT_MAX: usize = 4 * CHUNK_LEN; + let mut input_buf = [0; 3 * INPUT_MAX]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; + + // Use a fixed RNG seed for reproducibility. + let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + dbg!(_num_test); + let mut hasher = crate::Hasher::new(); + let mut total_input = 0; + // For each test, write 3 inputs of random length. + for _ in 0..3 { + let input_len = rng.random_range(0..INPUT_MAX + 1); + dbg!(input_len); + let input = &input_buf[total_input..][..input_len]; + hasher.update(input); + total_input += input_len; + } + let expected = reference_hash(&input_buf[..total_input]); + let mut test_out = [0; 32]; + hasher.finalize(&mut test_out); + assert_eq!(expected, test_out); + } +} + +#[test] +fn test_finalize_seek() { + let mut expected = [0; 1000]; + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(b"foobarbaz"); + reference_hasher.finalize(&mut expected); + } + + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(b"foobarbaz"); + + let mut out = [0; 103]; + for &seek in &[0, 1, 7, 59, 63, 64, 65, 501, expected.len() - out.len()] { + dbg!(seek); + test_hasher.finalize_seek(seek as u64, &mut out); + assert_eq!(&expected[seek..][..out.len()], &out[..]); + } +} + +#[test] +fn test_reset() { + { + let mut hasher = crate::Hasher::new(); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + let mut output = [0; 32]; + hasher.finalize(&mut output); + + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(&[42; CHUNK_LEN + 3]); + let mut reference_hash = [0; 32]; + reference_hasher.finalize(&mut reference_hash); + + assert_eq!(reference_hash, output); + } + { + let key = &[99; 32]; + let mut hasher = crate::Hasher::new_keyed(key); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + let mut output = [0; 32]; + hasher.finalize(&mut output); + + let mut reference_hasher = reference_impl::Hasher::new_keyed(key); + reference_hasher.update(&[42; CHUNK_LEN + 3]); + let mut reference_hash = [0; 32]; + reference_hasher.finalize(&mut reference_hash); + + assert_eq!(reference_hash, output); + } + { + let context = "BLAKE3 2020-02-12 10:20:58 reset test"; + let mut hasher = crate::Hasher::new_derive_key(context); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + let mut output = [0; 32]; + hasher.finalize(&mut output); + + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(&[42; CHUNK_LEN + 3]); + let mut reference_hash = [0; 32]; + reference_hasher.finalize(&mut reference_hash); + + assert_eq!(reference_hash, output); + } +} diff --git a/thirdparty/blake3/c/blake3_dispatch.c b/thirdparty/blake3/c/blake3_dispatch.c new file mode 100644 index 000000000..52db90586 --- /dev/null +++ b/thirdparty/blake3/c/blake3_dispatch.c @@ -0,0 +1,332 @@ +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include "blake3_impl.h" + +#if defined(_MSC_VER) +#include <Windows.h> +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <intrin.h> +#elif defined(__GNUC__) +#include <immintrin.h> +#else +#undef IS_X86 /* Unimplemented! */ +#endif +#endif + +#if !defined(BLAKE3_ATOMICS) +#if defined(__has_include) +#if __has_include(<stdatomic.h>) && !defined(_MSC_VER) +#define BLAKE3_ATOMICS 1 +#else +#define BLAKE3_ATOMICS 0 +#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */ +#else +#define BLAKE3_ATOMICS 0 +#endif /* defined(__has_include) */ +#endif /* BLAKE3_ATOMICS */ + +#if BLAKE3_ATOMICS +#define ATOMIC_INT _Atomic int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#elif defined(_MSC_VER) +#define ATOMIC_INT LONG +#define ATOMIC_LOAD(x) InterlockedOr(&x, 0) +#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) +#else +#define ATOMIC_INT int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#endif + +#define MAYBE_UNUSED(x) (void)((x)) + +#if defined(IS_X86) +static uint64_t xgetbv(void) { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} + +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + +#endif + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... */ + UNDEFINED = 1 << 30 +}; + +#if !defined(BLAKE3_TESTING) +static /* Allow the variable to be controlled manually for testing */ +#endif + ATOMIC_INT g_cpu_features = UNDEFINED; + +#if !defined(BLAKE3_TESTING) +static +#endif + enum cpu_feature + get_cpu_features(void) { + + /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ + enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); + if (features != UNDEFINED) { + return features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; + (void)edx; + features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 9)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + ATOMIC_STORE(g_cpu_features, features); + return features; +#else + /* How to detect NEON? */ + return 0; +#endif + } +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); + return; + } +#endif +#endif + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); + return; + } +#endif +#endif + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + + +void blake3_xof_many(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64], size_t outblocks) { + if (outblocks == 0) { + // The current assembly implementation always outputs at least 1 block. + return; + } +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); + return; + } +#endif +#endif + for(size_t i = 0; i < outblocks; ++i) { + blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i); + } +} + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#endif + +#if BLAKE3_USE_NEON == 1 + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; +#endif + + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +} + +// The dynamically detected SIMD degree of the current platform. +size_t blake3_simd_degree(void) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + return 16; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + return 8; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + return 4; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + return 4; + } +#endif +#endif +#if BLAKE3_USE_NEON == 1 + return 4; +#endif + return 1; +} diff --git a/thirdparty/blake3/c/blake3_impl.h b/thirdparty/blake3/c/blake3_impl.h new file mode 100644 index 000000000..facd59974 --- /dev/null +++ b/thirdparty/blake3/c/blake3_impl.h @@ -0,0 +1,333 @@ +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "blake3.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// internal flags +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +// This C implementation tries to support recent versions of GCC, Clang, and +// MSVC. +#if defined(_MSC_VER) +#define INLINE static __forceinline +#else +#define INLINE static inline __attribute__((always_inline)) +#endif + +#ifdef __cplusplus +#define NOEXCEPT noexcept +#else +#define NOEXCEPT +#endif + +#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC) +#define IS_X86 +#define IS_X86_64 +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define IS_X86 +#define IS_X86_32 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#define IS_AARCH64 +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <intrin.h> +#endif +#endif + +#if !defined(BLAKE3_USE_NEON) + // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness + #if defined(IS_AARCH64) + #if defined(__ARM_BIG_ENDIAN) + #define BLAKE3_USE_NEON 0 + #else + #define BLAKE3_USE_NEON 1 + #endif + #else + #define BLAKE3_USE_NEON 0 + #endif +#endif + +#if defined(IS_X86) +#define MAX_SIMD_DEGREE 16 +#elif BLAKE3_USE_NEON == 1 +#define MAX_SIMD_DEGREE 4 +#else +#define MAX_SIMD_DEGREE 1 +#endif + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) + +static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, + 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, + 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +/* x is assumed to be nonzero. */ +static unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return 63 ^ (unsigned int)__builtin_clzll(x); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return index; +#elif defined(_MSC_VER) && defined(IS_X86_32) + if(x >> 32) { + unsigned long index; + _BitScanReverse(&index, (unsigned long)(x >> 32)); + return 32 + index; + } else { + unsigned long index; + _BitScanReverse(&index, (unsigned long)x); + return index; + } +#else + unsigned int c = 0; + if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if(x & 0x0000000000000002ULL) { c += 1; } + return c; +#endif +} + +// Count the number of 1 bits. +INLINE unsigned int popcnt(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (unsigned int)__builtin_popcountll(x); +#else + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +#endif +} + +// Largest power of two less than or equal to x. As a special case, returns 1 +// when x is 0. +INLINE uint64_t round_down_to_power_of_2(uint64_t x) { + return 1ULL << highest_one(x | 1); +} + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } + +INLINE uint32_t counter_high(uint64_t counter) { + return (uint32_t)(counter >> 32); +} + +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN], + uint32_t block_words[16]) { + for (size_t i = 0; i < 16; i++) { + block_words[i] = load32(&block[i * 4]); + } +} + +INLINE void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_xof_many(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64], size_t outblocks); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +size_t blake3_simd_degree(void); + +BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out, bool use_tbb); + +#if defined(BLAKE3_USE_TBB) +BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb( + // shared params + const uint32_t key[8], uint8_t flags, bool use_tbb, + // left-hand side params + const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter, + uint8_t *l_cvs, size_t *l_n, + // right-hand side params + const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter, + uint8_t *r_cvs, size_t *r_n) NOEXCEPT; +#endif + +// Declarations for implementation-specific functions. +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#if defined(IS_X86) +#if !defined(BLAKE3_NO_SSE2) +void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_SSE41) +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX2) +void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX512) +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#if !defined(_WIN32) +void blake3_xof_many_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t* out, size_t outblocks); +#endif +#endif +#endif + +#if BLAKE3_USE_NEON == 1 +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_IMPL_H */ diff --git a/thirdparty/blake3/c/blake3_neon.c b/thirdparty/blake3/c/blake3_neon.c new file mode 100644 index 000000000..53ce83c3b --- /dev/null +++ b/thirdparty/blake3/c/blake3_neon.c @@ -0,0 +1,366 @@ +#include "blake3_impl.h" + +#include <arm_neon.h> + +#ifdef __ARM_BIG_ENDIAN +#error "This implementation only supports little-endian ARM." +// It might be that all we need for big-endian support here is to get the loads +// and stores right, but step zero would be finding a way to test it in CI. +#endif + +INLINE uint32x4_t loadu_128(const uint8_t src[16]) { + // vld1q_u32 has alignment requirements. Don't use it. + return vreinterpretq_u32_u8(vld1q_u8(src)); +} + +INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { + // vst1q_u32 has alignment requirements. Don't use it. + vst1q_u8(dest, vreinterpretq_u8_u32(src)); +} + +INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { + return vaddq_u32(a, b); +} + +INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { + return veorq_u32(a, b); +} + +INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } + +INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + uint32_t array[4] = {a, b, c, d}; + return vld1q_u32(array); +} + +INLINE uint32x4_t rot16_128(uint32x4_t x) { + // The straightforward implementation would be two shifts and an or, but that's + // slower on microarchitectures we've tested. See + // https://github.com/BLAKE3-team/BLAKE3/pull/319. + // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); +} + +INLINE uint32x4_t rot12_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); + return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); +} + +INLINE uint32x4_t rot8_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); +#if defined(__clang__) + return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); +#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 + static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12}; + return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8)); +#else + return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8); +#endif +} + +INLINE uint32x4_t rot7_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); + return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7); +} + +// TODO: compress_neon + +// TODO: hash2_neon + +/* + * ---------------------------------------------------------------------------- + * hash4_neon + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { + // Individually transpose the four 2x2 sub-matrices in each corner. + uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); + uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); + + // Swap the top-right and bottom-left 2x2s (which just got transposed). + vecs[0] = + vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); + vecs[1] = + vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); + vecs[2] = + vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); + vecs[3] = + vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, uint32x4_t out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + uint32x4_t *out_low, uint32x4_t *out_high) { + uint64_t mask = (increment_counter ? ~0 : 0); + *out_low = set4( + counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); + *out_high = set4( + counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); +} + +void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + uint32x4_t h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + uint32x4_t counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + uint32x4_t block_flags_vec = set1_128(block_flags); + uint32x4_t msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + uint32x4_t v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_neon + * ---------------------------------------------------------------------------- + */ + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +INLINE void hash_one_neon(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, + uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + // TODO: Implement compress_neon. However note that according to + // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, + // compress_neon might not be any faster than compress_portable. + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 4) { + blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/thirdparty/blake3/c/blake3_portable.c b/thirdparty/blake3/c/blake3_portable.c new file mode 100644 index 000000000..062dd1b47 --- /dev/null +++ b/thirdparty/blake3/c/blake3_portable.c @@ -0,0 +1,160 @@ +#include "blake3_impl.h" +#include <string.h> + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { + return (w >> c) | (w << (32 - c)); +} + +INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) { + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { + // Select the message schedule based on the round. + const uint8_t *schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/thirdparty/blake3/c/blake3_sse2.c b/thirdparty/blake3/c/blake3_sse2.c new file mode 100644 index 000000000..691e1c680 --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse2.c @@ -0,0 +1,566 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 4 + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE void storeu(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. +INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16(__m128i x) { + return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); +} + +INLINE __m128i rot12(__m128i x) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +} + +INLINE __m128i rot8(__m128i x) { + return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); +} + +INLINE __m128i rot7(__m128i x) { + return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +} + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { + const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + __m128i mask = _mm_set1_epi16(imm8); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/thirdparty/blake3/c/blake3_sse2_x86-64_unix.S b/thirdparty/blake3/c/blake3_sse2_x86-64_unix.S new file mode 100644 index 000000000..99f033fef --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse2_x86-64_unix.S @@ -0,0 +1,2291 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse2: +_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/thirdparty/blake3/c/blake3_sse2_x86-64_windows_gnu.S b/thirdparty/blake3/c/blake3_sse2_x86-64_windows_gnu.S new file mode 100644 index 000000000..4facb50e7 --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse2_x86-64_windows_gnu.S @@ -0,0 +1,2332 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +.section .text + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x90] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr [rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.p2align 6 +_blake3_compress_xof_sse2: +blake3_compress_xof_sse2: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.section .rdata +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/thirdparty/blake3/c/blake3_sse2_x86-64_windows_msvc.asm b/thirdparty/blake3/c/blake3_sse2_x86-64_windows_msvc.asm new file mode 100644 index 000000000..507502f11 --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse2_x86-64_windows_msvc.asm @@ -0,0 +1,2350 @@ +public _blake3_hash_many_sse2 +public blake3_hash_many_sse2 +public blake3_compress_in_place_sse2 +public _blake3_compress_in_place_sse2 +public blake3_compress_xof_sse2 +public _blake3_compress_xof_sse2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse2 PROC +_blake3_hash_many_sse2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + movd xmm13, dword ptr [rsp+124H] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + shl rax, 20H + or rax, 40H + movd xmm3, rax + movdqa xmmword ptr [rsp+20H], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + punpcklqdq xmm3, xmmword ptr [rsp+20H] + punpcklqdq xmm11, xmmword ptr [rsp+20H] + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm12, xmm13 + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+30H], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+30H] + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + mov eax, dword ptr [rsp+130H] + neg eax + mov r10d, dword ptr [rsp+110H+8*rax] + mov r11d, dword ptr [rsp+120H+8*rax] + mov dword ptr [rsp+110H], r10d + mov dword ptr [rsp+120H], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm10 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse2 ENDP +blake3_hash_many_sse2 ENDP + +blake3_compress_in_place_sse2 PROC +_blake3_compress_in_place_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse2 ENDP +blake3_compress_in_place_sse2 ENDP + +ALIGN 16 +blake3_compress_xof_sse2 PROC +_blake3_compress_xof_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse2 ENDP +blake3_compress_xof_sse2 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +PBLENDW_0x33_MASK: + dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H +PBLENDW_0xCC_MASK: + dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH +PBLENDW_0x3F_MASK: + dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H +PBLENDW_0xC0_MASK: + dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH + +_RDATA ENDS +END diff --git a/thirdparty/blake3/c/blake3_sse41.c b/thirdparty/blake3/c/blake3_sse41.c new file mode 100644 index 000000000..4653a856f --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse41.c @@ -0,0 +1,560 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 4 + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE void storeu(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. +INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16(__m128i x) { + return _mm_shuffle_epi8( + x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} + +INLINE __m128i rot12(__m128i x) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +} + +INLINE __m128i rot8(__m128i x) { + return _mm_shuffle_epi8( + x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} + +INLINE __m128i rot7(__m128i x) { + return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +} + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/thirdparty/blake3/c/blake3_sse41_x86-64_unix.S b/thirdparty/blake3/c/blake3_sse41_x86-64_unix.S new file mode 100644 index 000000000..a3ff64269 --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse41_x86-64_unix.S @@ -0,0 +1,2028 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse41: +_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/thirdparty/blake3/c/blake3_sse41_x86-64_windows_gnu.S b/thirdparty/blake3/c/blake3_sse41_x86-64_windows_gnu.S new file mode 100644 index 000000000..02083f9d5 --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse41_x86-64_windows_gnu.S @@ -0,0 +1,2069 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +.section .text + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x90] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr [rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.p2align 6 +_blake3_compress_xof_sse41: +blake3_compress_xof_sse41: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.section .rdata +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/thirdparty/blake3/c/blake3_sse41_x86-64_windows_msvc.asm b/thirdparty/blake3/c/blake3_sse41_x86-64_windows_msvc.asm new file mode 100644 index 000000000..8966c7b84 --- /dev/null +++ b/thirdparty/blake3/c/blake3_sse41_x86-64_windows_msvc.asm @@ -0,0 +1,2089 @@ +public _blake3_hash_many_sse41 +public blake3_hash_many_sse41 +public blake3_compress_in_place_sse41 +public _blake3_compress_in_place_sse41 +public blake3_compress_xof_sse41 +public _blake3_compress_xof_sse41 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse41 PROC +_blake3_hash_many_sse41 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + pinsrd xmm14, dword ptr [rsp+124H], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0CCH + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0C0H + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0CCH + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0C0H + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + movdqa xmm0, xmmword ptr [rsp+130H] + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm2, xmmword ptr [rsp+120H] + movdqu xmm3, xmmword ptr [rsp+118H] + movdqu xmm4, xmmword ptr [rsp+128H] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+110H], xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse41 ENDP +blake3_hash_many_sse41 ENDP + +blake3_compress_in_place_sse41 PROC +_blake3_compress_in_place_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse41 ENDP +blake3_compress_in_place_sse41 ENDP + +ALIGN 16 +blake3_compress_xof_sse41 PROC +_blake3_compress_xof_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse41 ENDP +blake3_compress_xof_sse41 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +_RDATA ENDS +END + diff --git a/thirdparty/blake3/c/blake3_tbb.cpp b/thirdparty/blake3/c/blake3_tbb.cpp new file mode 100644 index 000000000..c2bc2dbed --- /dev/null +++ b/thirdparty/blake3/c/blake3_tbb.cpp @@ -0,0 +1,37 @@ +#include <cstddef> +#include <cstdint> + +#include <oneapi/tbb/parallel_invoke.h> + +#include "blake3_impl.h" + +static_assert(TBB_USE_EXCEPTIONS == 0, + "This file should be compiled with C++ exceptions disabled."); + +extern "C" void blake3_compress_subtree_wide_join_tbb( + // shared params + const uint32_t key[8], uint8_t flags, bool use_tbb, + // left-hand side params + const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter, + uint8_t *l_cvs, size_t *l_n, + // right-hand side params + const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter, + uint8_t *r_cvs, size_t *r_n) noexcept { + if (!use_tbb) { + *l_n = blake3_compress_subtree_wide(l_input, l_input_len, key, + l_chunk_counter, flags, l_cvs, use_tbb); + *r_n = blake3_compress_subtree_wide(r_input, r_input_len, key, + r_chunk_counter, flags, r_cvs, use_tbb); + return; + } + + oneapi::tbb::parallel_invoke( + [=]() { + *l_n = blake3_compress_subtree_wide( + l_input, l_input_len, key, l_chunk_counter, flags, l_cvs, use_tbb); + }, + [=]() { + *r_n = blake3_compress_subtree_wide( + r_input, r_input_len, key, r_chunk_counter, flags, r_cvs, use_tbb); + }); +} diff --git a/thirdparty/blake3/c/cmake/BLAKE3/ContinuousIntegration.cmake b/thirdparty/blake3/c/cmake/BLAKE3/ContinuousIntegration.cmake new file mode 100644 index 000000000..57a1f1810 --- /dev/null +++ b/thirdparty/blake3/c/cmake/BLAKE3/ContinuousIntegration.cmake @@ -0,0 +1,235 @@ +cmake_minimum_required(VERSION 3.13 FATAL_ERROR) + +if(BUILD_SHARED_LIBS) + message(FATAL_ERROR "BUILD_SHARED_LIBS is incompatible with BLAKE3_TESTING_CI") +endif() + +include(CTest) + +# Declare a testing specific variant of the `blake3` library target. +# +# We use a separate library target in order to be able to perform compilation with various +# combinations of features which are too noisy to specify in the main CMake config as options for +# the normal `blake3` target. +# +# Initially this target has no properties but eventually we will populate them by copying all of the +# relevant properties from the normal `blake3` target. +add_library(blake3-testing + blake3.c + blake3_dispatch.c + blake3_portable.c +) + +if(BLAKE3_USE_TBB AND TBB_FOUND) + target_sources(blake3-testing + PRIVATE + blake3_tbb.cpp) +endif() + +if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm") + # Conditionally add amd64 asm files to `blake3-testing` sources + if(MSVC) + if(NOT BLAKE3_NO_AVX2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_msvc.asm) + endif() + if(NOT BLAKE3_NO_AVX512) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_msvc.asm) + endif() + if(NOT BLAKE3_NO_SSE2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_msvc.asm) + endif() + if(NOT BLAKE3_NO_SSE41) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_msvc.asm) + endif() + elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" + OR CMAKE_C_COMPILER_ID STREQUAL "Clang" + OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") + if (WIN32) + if(NOT BLAKE3_NO_AVX2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_gnu.S) + endif() + if(NOT BLAKE3_NO_AVX512) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_gnu.S) + endif() + if(NOT BLAKE3_NO_SSE2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_gnu.S) + endif() + if(NOT BLAKE3_NO_SSE41) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_gnu.S) + endif() + elseif(UNIX) + if(NOT BLAKE3_NO_AVX2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_unix.S) + endif() + if(NOT BLAKE3_NO_AVX512) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_unix.S) + endif() + if(NOT BLAKE3_NO_SSE2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_unix.S) + endif() + if(NOT BLAKE3_NO_SSE41) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_unix.S) + endif() + endif() + endif() + target_sources(blake3-testing PRIVATE ${BLAKE3_AMD64_ASM_SOURCES}) +elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics") + # Conditionally add amd64 C files to `blake3-testing` sources + if (NOT DEFINED BLAKE3_CFLAGS_SSE2 + OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1 + OR NOT DEFINED BLAKE3_CFLAGS_AVX2 + OR NOT DEFINED BLAKE3_CFLAGS_AVX512) + message(WARNING "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.") + else() + set(BLAKE3_SIMD_X86_INTRINSICS ON) + endif() + + if(NOT BLAKE3_NO_AVX2) + target_sources(blake3-testing PRIVATE blake3_avx2.c) + set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}") + endif() + if(NOT BLAKE3_NO_AVX512) + target_sources(blake3-testing PRIVATE blake3_avx512.c) + set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}") + endif() + if(NOT BLAKE3_NO_SSE2) + target_sources(blake3-testing PRIVATE blake3_sse2.c) + set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") + endif() + if(NOT BLAKE3_NO_SSE41) + target_sources(blake3-testing PRIVATE blake3_sse41.c) + set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") + endif() + +elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics") + # Conditionally add neon C files to `blake3-testing` sources + + target_sources(blake3-testing PRIVATE + blake3_neon.c + ) + target_compile_definitions(blake3-testing PRIVATE + BLAKE3_USE_NEON=1 + ) + + if (DEFINED BLAKE3_CFLAGS_NEON) + set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") + endif() + +elseif(BLAKE3_SIMD_TYPE STREQUAL "none") + # Disable neon if simd type is "none". We check for individual amd64 features further below. + + target_compile_definitions(blake3-testing PRIVATE + BLAKE3_USE_NEON=0 + ) + +endif() + +if(BLAKE3_NO_AVX2) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX2) +endif() +if(BLAKE3_NO_AVX512) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX512) +endif() +if(BLAKE3_NO_SSE2) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE2) +endif() +if(BLAKE3_NO_SSE41) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE41) +endif() + +target_compile_definitions(blake3-testing PUBLIC BLAKE3_TESTING) + +get_target_property(BLAKE3_COMPILE_DEFINITIONS blake3 COMPILE_DEFINITIONS) +if(BLAKE3_COMPILE_DEFINITIONS) + target_compile_definitions(blake3-testing PUBLIC + ${BLAKE3_COMPILE_DEFINITIONS}) +endif() + +get_target_property(BLAKE3_COMPILE_OPTIONS blake3 COMPILE_OPTIONS) +if(BLAKE3_COMPILE_OPTIONS) + target_compile_options(blake3-testing PRIVATE + ${BLAKE3_COMPILE_OPTIONS} + -O3 + -Wall + -Wextra + -pedantic + -fstack-protector-strong + -D_FORTIFY_SOURCE=2 + -fPIE + -fvisibility=hidden + -fsanitize=address,undefined + ) +endif() + +get_target_property(BLAKE3_INCLUDE_DIRECTORIES blake3 INCLUDE_DIRECTORIES) +if(BLAKE3_INCLUDE_DIRECTORIES) + target_include_directories(blake3-testing PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> + ) +endif() + +get_target_property(BLAKE3_LINK_LIBRARIES blake3 LINK_LIBRARIES) +if(BLAKE3_LINK_LIBRARIES) + target_link_libraries(blake3-testing PRIVATE ${BLAKE3_LINK_LIBRARIES}) +endif() + +get_target_property(BLAKE3_LINK_OPTIONS blake3 LINK_OPTIONS) +if(BLAKE3_LINK_OPTIONS) + target_link_options(blake3-testing PRIVATE + ${BLAKE3_LINK_OPTIONS} + -fsanitize=address,undefined + -pie + -Wl,-z,relro,-z,now + ) +endif() + +# test asm target +add_executable(blake3-asm-test + main.c +) +set_target_properties(blake3-asm-test PROPERTIES + OUTPUT_NAME blake3 + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}) +target_link_libraries(blake3-asm-test PRIVATE blake3-testing) +target_compile_definitions(blake3-asm-test PRIVATE BLAKE3_TESTING) +target_compile_options(blake3-asm-test PRIVATE + -O3 + -Wall + -Wextra + -pedantic + -fstack-protector-strong + -D_FORTIFY_SOURCE=2 + -fPIE + -fvisibility=hidden + -fsanitize=address,undefined +) +target_link_options(blake3-asm-test PRIVATE + -fsanitize=address,undefined + -pie + -Wl,-z,relro,-z,now +) + +add_test(NAME blake3-testing + COMMAND "${CMAKE_CTEST_COMMAND}" + --verbose + --extra-verbose + --build-and-test "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" + --build-generator "${CMAKE_GENERATOR}" + --build-makeprogram "${CMAKE_MAKE_PROGRAM}" + --build-project libblake3 + --build-target blake3-asm-test + --build-options + --fresh + "-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}" + "-DBLAKE3_TESTING=${BLAKE3_TESTING}" + "-DBLAKE3_TESTING_CI=${BLAKE3_TESTING_CI}" + "-DBLAKE3_USE_TBB=${BLAKE3_USE_TBB}" + "-DBLAKE3_SIMD_TYPE=${BLAKE3_SIMD_TYPE}" + "-DBLAKE3_NO_SSE2=${BLAKE3_NO_SSE2}" + "-DBLAKE3_NO_SSE41=${BLAKE3_NO_SSE41}" + "-DBLAKE3_NO_AVX2=${BLAKE3_NO_AVX2}" + "-DBLAKE3_NO_AVX512=${BLAKE3_NO_AVX512}" + --test-command + "${CMAKE_SOURCE_DIR}/test.py" + ) diff --git a/thirdparty/blake3/c/cmake/BLAKE3/Examples.cmake b/thirdparty/blake3/c/cmake/BLAKE3/Examples.cmake new file mode 100644 index 000000000..8f0edc578 --- /dev/null +++ b/thirdparty/blake3/c/cmake/BLAKE3/Examples.cmake @@ -0,0 +1,13 @@ +if(NOT WIN32) + add_executable(blake3-example + example.c) + target_link_libraries(blake3-example PRIVATE blake3) + install(TARGETS blake3-example) + + if(BLAKE3_USE_TBB) + add_executable(blake3-example-tbb + example_tbb.c) + target_link_libraries(blake3-example-tbb PRIVATE blake3) + install(TARGETS blake3-example-tbb) + endif() +endif() diff --git a/thirdparty/blake3/c/cmake/BLAKE3/Testing.cmake b/thirdparty/blake3/c/cmake/BLAKE3/Testing.cmake new file mode 100644 index 000000000..9beb16a69 --- /dev/null +++ b/thirdparty/blake3/c/cmake/BLAKE3/Testing.cmake @@ -0,0 +1,3 @@ +if(BLAKE3_TESTING_CI) + include(BLAKE3/ContinuousIntegration) +endif() diff --git a/thirdparty/blake3/c/example.c b/thirdparty/blake3/c/example.c new file mode 100644 index 000000000..41a707ea1 --- /dev/null +++ b/thirdparty/blake3/c/example.c @@ -0,0 +1,36 @@ +#include "blake3.h" +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +int main(void) { + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + while (1) { + ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); + if (n > 0) { + blake3_hasher_update(&hasher, buf, n); + } else if (n == 0) { + break; // end of file + } else { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + return 1; + } + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} diff --git a/thirdparty/blake3/c/example_tbb.c b/thirdparty/blake3/c/example_tbb.c new file mode 100644 index 000000000..88c4348f2 --- /dev/null +++ b/thirdparty/blake3/c/example_tbb.c @@ -0,0 +1,57 @@ +#include "blake3.h" +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> + +int main(int argc, char **argv) { + // For each filepath argument, memory map it and hash it. + for (int i = 1; i < argc; i++) { + // Open and memory map the file. + int fd = open(argv[i], O_RDONLY); + if (fd == -1) { + fprintf(stderr, "open failed: %s\n", strerror(errno)); + return 1; + } + struct stat statbuf; + if (fstat(fd, &statbuf) == -1) { + fprintf(stderr, "stat failed: %s\n", strerror(errno)); + return 1; + } + void *mapped = mmap(NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (mapped == MAP_FAILED) { + fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + return 1; + } + + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Hash the mapped file using multiple threads. + blake3_hasher_update_tbb(&hasher, mapped, statbuf.st_size); + + // Unmap and close the file. + if (munmap(mapped, statbuf.st_size) == -1) { + fprintf(stderr, "munmap failed: %s\n", strerror(errno)); + return 1; + } + if (close(fd) == -1) { + fprintf(stderr, "close failed: %s\n", strerror(errno)); + return 1; + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + } +} diff --git a/thirdparty/blake3/c/libblake3.pc.in b/thirdparty/blake3/c/libblake3.pc.in new file mode 100644 index 000000000..460dc90b1 --- /dev/null +++ b/thirdparty/blake3/c/libblake3.pc.in @@ -0,0 +1,12 @@ +prefix="@CMAKE_INSTALL_PREFIX@" +exec_prefix="${prefix}" +libdir="@PKG_CONFIG_INSTALL_LIBDIR@" +includedir="@PKG_CONFIG_INSTALL_INCLUDEDIR@" + +Name: @PROJECT_NAME@ +Description: @PROJECT_DESCRIPTION@ +Version: @PROJECT_VERSION@ + +Requires: @PKG_CONFIG_REQUIRES@ +Libs: -L"${libdir}" -lblake3 @PKG_CONFIG_LIBS@ +Cflags: -I"${includedir}" @PKG_CONFIG_CFLAGS@ diff --git a/thirdparty/blake3/c/main.c b/thirdparty/blake3/c/main.c new file mode 100644 index 000000000..77cab58f8 --- /dev/null +++ b/thirdparty/blake3/c/main.c @@ -0,0 +1,166 @@ +/* + * This main file is intended for testing via `make test`. It does not build in + * other settings. See README.md in this directory for examples of how to build + * C code. + */ + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "blake3.h" +#include "blake3_impl.h" + +#define HASH_MODE 0 +#define KEYED_HASH_MODE 1 +#define DERIVE_KEY_MODE 2 + +static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) { + if ('0' <= c && c <= '9') { + *value = c - '0'; + *valid = true; + } else if ('a' <= c && c <= 'f') { + *value = 10 + c - 'a'; + *valid = true; + } else { + *valid = false; + } +} + +static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) { + size_t hex_len = strlen(hex_key); + if (hex_len != 64) { + fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n", + hex_len); + return 1; + } + for (size_t i = 0; i < 64; i++) { + uint8_t value; + bool valid; + hex_char_value(hex_key[i], &value, &valid); + if (!valid) { + fprintf(stderr, "Invalid hex char.\n"); + return 1; + } + if (i % 2 == 0) { + out[i / 2] = 0; + value <<= 4; + } + out[i / 2] += value; + } + return 0; +} + +/* A little repetition here */ +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... */ + UNDEFINED = 1 << 30 +}; + +extern enum cpu_feature g_cpu_features; +enum cpu_feature get_cpu_features(void); + +int main(int argc, char **argv) { + size_t out_len = BLAKE3_OUT_LEN; + uint8_t key[BLAKE3_KEY_LEN]; + char *context = ""; + uint8_t mode = HASH_MODE; + while (argc > 1) { + if (argc <= 2) { + fprintf(stderr, "Odd number of arguments.\n"); + return 1; + } + if (strcmp("--length", argv[1]) == 0) { + char *endptr = NULL; + errno = 0; + unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10); + if (errno != 0 || out_len_ll > SIZE_MAX || endptr == argv[2] || + *endptr != 0) { + fprintf(stderr, "Bad length argument.\n"); + return 1; + } + out_len = (size_t)out_len_ll; + } else if (strcmp("--keyed", argv[1]) == 0) { + mode = KEYED_HASH_MODE; + int ret = parse_key(argv[2], key); + if (ret != 0) { + return ret; + } + } else if (strcmp("--derive-key", argv[1]) == 0) { + mode = DERIVE_KEY_MODE; + context = argv[2]; + } else { + fprintf(stderr, "Unknown flag.\n"); + return 1; + } + argc -= 2; + argv += 2; + } + + /* + * We're going to hash the input multiple times, so we need to buffer it all. + * This is just for test cases, so go ahead and assume that the input is less + * than 1 MiB. + */ + size_t buf_capacity = 1 << 20; + uint8_t *buf = malloc(buf_capacity); + assert(buf != NULL); + size_t buf_len = 0; + while (1) { + size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin); + if (n == 0) { + break; + } + buf_len += n; + assert(buf_len < buf_capacity); + } + + const int mask = get_cpu_features(); + int feature = 0; + do { + fprintf(stderr, "Testing 0x%08X\n", feature); + g_cpu_features = feature; + blake3_hasher hasher; + switch (mode) { + case HASH_MODE: + blake3_hasher_init(&hasher); + break; + case KEYED_HASH_MODE: + blake3_hasher_init_keyed(&hasher, key); + break; + case DERIVE_KEY_MODE: + blake3_hasher_init_derive_key(&hasher, context); + break; + default: + abort(); + } + + blake3_hasher_update(&hasher, buf, buf_len); + + /* TODO: An incremental output reader API to avoid this allocation. */ + uint8_t *out = malloc(out_len); + if (out_len > 0 && out == NULL) { + fprintf(stderr, "malloc() failed.\n"); + return 1; + } + blake3_hasher_finalize(&hasher, out, out_len); + for (size_t i = 0; i < out_len; i++) { + printf("%02x", out[i]); + } + printf("\n"); + free(out); + feature = (feature - mask) & mask; + } while (feature != 0); + free(buf); + return 0; +} diff --git a/thirdparty/blake3/c/test.py b/thirdparty/blake3/c/test.py new file mode 100755 index 000000000..98b1c3dff --- /dev/null +++ b/thirdparty/blake3/c/test.py @@ -0,0 +1,97 @@ +#! /usr/bin/env python3 + +from binascii import hexlify +import json +from os import path +import subprocess + +HERE = path.dirname(__file__) +TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json") +TEST_VECTORS = json.load(open(TEST_VECTORS_PATH)) + + +def run_blake3(args, input): + output = subprocess.run([path.join(HERE, "blake3")] + args, + input=input, + stdout=subprocess.PIPE, + check=True) + return output.stdout.decode().strip() + + +# Fill the input with a repeating byte pattern. We use a cycle length of 251, +# because that's the largest prime number less than 256. This makes it unlikely +# to swapping any two adjacent input blocks or chunks will give the same +# answer. +def make_test_input(length): + i = 0 + buf = bytearray() + while len(buf) < length: + buf.append(i) + i = (i + 1) % 251 + return buf + + +def main(): + for case in TEST_VECTORS["cases"]: + input_len = case["input_len"] + input = make_test_input(input_len) + hex_key = hexlify(TEST_VECTORS["key"].encode()) + context_string = TEST_VECTORS["context_string"] + expected_hash_xof = case["hash"] + expected_hash = expected_hash_xof[:64] + expected_keyed_hash_xof = case["keyed_hash"] + expected_keyed_hash = expected_keyed_hash_xof[:64] + expected_derive_key_xof = case["derive_key"] + expected_derive_key = expected_derive_key_xof[:64] + + # Test the default hash. + test_hash = run_blake3([], input) + for line in test_hash.splitlines(): + assert expected_hash == line, \ + "hash({}): {} != {}".format(input_len, expected_hash, line) + + # Test the extended hash. + xof_len = len(expected_hash_xof) // 2 + test_hash_xof = run_blake3(["--length", str(xof_len)], input) + for line in test_hash_xof.splitlines(): + assert expected_hash_xof == line, \ + "hash_xof({}): {} != {}".format( + input_len, expected_hash_xof, line) + + # Test the default keyed hash. + test_keyed_hash = run_blake3(["--keyed", hex_key], input) + for line in test_keyed_hash.splitlines(): + assert expected_keyed_hash == line, \ + "keyed_hash({}): {} != {}".format( + input_len, expected_keyed_hash, line) + + # Test the extended keyed hash. + xof_len = len(expected_keyed_hash_xof) // 2 + test_keyed_hash_xof = run_blake3( + ["--keyed", hex_key, "--length", + str(xof_len)], input) + for line in test_keyed_hash_xof.splitlines(): + assert expected_keyed_hash_xof == line, \ + "keyed_hash_xof({}): {} != {}".format( + input_len, expected_keyed_hash_xof, line) + + # Test the default derive key. + test_derive_key = run_blake3(["--derive-key", context_string], input) + for line in test_derive_key.splitlines(): + assert expected_derive_key == line, \ + "derive_key({}): {} != {}".format( + input_len, expected_derive_key, line) + + # Test the extended derive key. + xof_len = len(expected_derive_key_xof) // 2 + test_derive_key_xof = run_blake3( + ["--derive-key", context_string, "--length", + str(xof_len)], input) + for line in test_derive_key_xof.splitlines(): + assert expected_derive_key_xof == line, \ + "derive_key_xof({}): {} != {}".format( + input_len, expected_derive_key_xof, line) + + +if __name__ == "__main__": + main() |