diff options
| author | Stefan Boberg <[email protected]> | 2022-09-20 17:28:41 +0200 |
|---|---|---|
| committer | GitHub <[email protected]> | 2022-09-20 17:28:41 +0200 |
| commit | a735967c7c54fcecbfd9760286afc06a3b48233a (patch) | |
| tree | 4789717b7a05c7122cb366d3bcf5810db9678058 | |
| parent | rename URI chunk requests from value -> chunk (#166) (diff) | |
| download | zen-a735967c7c54fcecbfd9760286afc06a3b48233a.tar.xz zen-a735967c7c54fcecbfd9760286afc06a3b48233a.zip | |
Use BLAKE3 port from vcpkg (#141)
use BLAKE3 port from vcpkg instead of in-tree binaries
89 files changed, 13 insertions, 44583 deletions
diff --git a/thirdparty/BLAKE3/.github/workflows/build_b3sum.py b/thirdparty/BLAKE3/.github/workflows/build_b3sum.py deleted file mode 100644 index e487daf97..000000000 --- a/thirdparty/BLAKE3/.github/workflows/build_b3sum.py +++ /dev/null @@ -1,37 +0,0 @@ -#! /usr/bin/env python3 - -from pathlib import Path -import platform -import shutil -import subprocess -import sys - -ROOT = Path(__file__).parent.parent.parent -RUST_TARGET = sys.argv[1] - -subprocess.run(["cargo", "build", "--target", sys.argv[1], "--release"], - cwd=ROOT / "b3sum") - -if platform.system() == "Windows": - original_exe_name = "b3sum.exe" -else: - original_exe_name = "b3sum" - -if platform.system() == "Windows": - new_exe_name = "b3sum_windows_x64_bin.exe" -elif platform.system() == "Darwin": - new_exe_name = "b3sum_macos_x64_bin" -elif platform.system() == "Linux": - new_exe_name = "b3sum_linux_x64_bin" -else: - raise RuntimeError("Unexpected platform: " + platform.system()) - -# Copy the built binary so that it has the upload name we want. -out_dir = ROOT / "b3sum/target" / RUST_TARGET / "release" -original_exe_path = str(out_dir / original_exe_name) -new_exe_path = str(out_dir / new_exe_name) -print("copying", repr(original_exe_path), "to", repr(new_exe_path)) -shutil.copyfile(original_exe_path, new_exe_path) - -# This lets the subsequent upload step get the filepath. 
-print("::set-output name=bin_path::" + new_exe_path) diff --git a/thirdparty/BLAKE3/.github/workflows/ci.yml b/thirdparty/BLAKE3/.github/workflows/ci.yml deleted file mode 100644 index 464a411d5..000000000 --- a/thirdparty/BLAKE3/.github/workflows/ci.yml +++ /dev/null @@ -1,208 +0,0 @@ -name: tests - -on: - push: - branches: - - "*" - # not on tags - pull_request: - -env: - BLAKE3_CI: "1" - RUSTFLAGS: "-D warnings" - RUST_BACKTRACE: "1" - -jobs: - cargo_tests: - name: ${{ matrix.target.name }} ${{ matrix.channel }} - runs-on: ${{ matrix.target.os }} - strategy: - fail-fast: false - matrix: - target: [ - { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, - { "os": "macOS-latest", "toolchain": "x86_64-apple-darwin", "name": "macOS" }, - { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, - { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } - ] - channel: [stable, beta, nightly] - - steps: - - uses: actions/checkout@v1 - - uses: actions-rs/toolchain@v1 - with: - toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true - # Print the compiler version, for debugging. - - name: print compiler version - run: cargo run --quiet - working-directory: ./tools/compiler_version - # Print out instruction set support, for debugging. - - name: print instruction set support - run: cargo run --quiet - working-directory: ./tools/instruction_set_support - # Default tests plus Rayon. - - run: cargo test --features=rayon - # no_std tests. - - run: cargo test --no-default-features - - # A matrix of different test settings: - # - debug vs release - # - assembly vs Rust+C intrinsics vs pure Rust intrinsics - # - different levels of SIMD support - # - # Full SIMD support. 
- - run: cargo test --features= - - run: cargo test --features=prefer_intrinsics - - run: cargo test --features=pure - - run: cargo test --features= --release - - run: cargo test --features=prefer_intrinsics --release - - run: cargo test --features=pure --release - # No AVX-512. - - run: cargo test --features=no_avx512 - - run: cargo test --features=no_avx512,prefer_intrinsics - - run: cargo test --features=no_avx512,pure - - run: cargo test --features=no_avx512 --release - - run: cargo test --features=no_avx512,prefer_intrinsics --release - - run: cargo test --features=no_avx512,pure --release - # No AVX2. - - run: cargo test --features=no_avx512,no_avx2 - - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics - - run: cargo test --features=no_avx512,no_avx2,pure - - run: cargo test --features=no_avx512,no_avx2 --release - - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics --release - - run: cargo test --features=no_avx512,no_avx2,pure --release - # No SSE4.1 - - run: cargo test --features=no_avx512,no_avx2,no_sse41 - - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics - - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure - - run: cargo test --features=no_avx512,no_avx2,no_sse41 --release - - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics --release - - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure --release - # No SSE2 - - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 - - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics - - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure - - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 --release - - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics --release - - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure --release - - # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. 
- - run: cargo test --benches - env: - RUSTC_BOOTSTRAP: 1 - # Test vectors. - - name: test vectors - run: cargo test - working-directory: ./test_vectors - - name: test vectors intrinsics - run: cargo test --features=prefer_intrinsics - working-directory: ./test_vectors - - name: test vectors pure - run: cargo test --features=pure - working-directory: ./test_vectors - # Test b3sum. - - name: test b3sum - run: cargo test - working-directory: ./b3sum - - name: test b3sum --no-default-features - run: cargo test --no-default-features - working-directory: ./b3sum - # Test C code. - - name: cargo test C bindings assembly - run: cargo test - working-directory: ./c/blake3_c_rust_bindings - - name: cargo test C bindings intrinsics - run: cargo test --features=prefer_intrinsics - working-directory: ./c/blake3_c_rust_bindings - # Reference impl doc test. - - name: reference impl doc test - run: cargo test - working-directory: ./reference_impl - - cross_tests: - name: cross ${{ matrix.arch }} - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - arch: - - i586-unknown-linux-musl - - i686-unknown-linux-musl - - armv7-unknown-linux-gnueabihf - - aarch64-unknown-linux-gnu - - mips-unknown-linux-gnu - - steps: - - uses: actions/checkout@v1 - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - - run: cargo install cross - # Test the portable implementation on everything. - - run: cross test --target ${{ matrix.arch }} - # Test building for ancient i386 processors without guaranteed SSE2 support. - - run: cross rustc --target ${{ matrix.arch }} -- -C target-cpu=i386 - if: startsWith(matrix.arch, 'i586-') || startsWith(matrix.arch, 'i686-') - # Test the NEON implementation on ARM targets. - - run: cross test --target ${{ matrix.arch }} --features=neon - if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') - # Test vectors. Note that this uses a hacky script due to path dependency limitations. 
- - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }} - # C code. Same issue with the hacky script. - - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} - - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} --features=neon - if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') - - # Currently only on x86. - c_tests: - name: C Makefile tests - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v1 - # Test the intrinsics-based implementations. - - run: make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse2.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse41.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx2.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx512.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test - working-directory: ./c - # Test the assembly implementations. 
- - run: make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse2_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse41_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx2_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx512_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test_asm - working-directory: ./c - # Restore the files we deleted above. - - run: git checkout . - # Build the example. - - run: make -f Makefile.testing example - working-directory: ./c diff --git a/thirdparty/BLAKE3/.github/workflows/tag.yml b/thirdparty/BLAKE3/.github/workflows/tag.yml deleted file mode 100644 index 577d4f312..000000000 --- a/thirdparty/BLAKE3/.github/workflows/tag.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: publish_b3sum_binaries - -on: - push: - tags: - - "*" - -env: - BLAKE3_CI: "1" - RUSTFLAGS: "-D warnings" - -jobs: - cargo_tests: - name: ${{ matrix.target.name }} - runs-on: ${{ matrix.target.os }} - strategy: - fail-fast: false - matrix: - target: [ - { "os": "ubuntu-latest", "rust-target": "x86_64-unknown-linux-musl", "name": "Linux" }, - { "os": "macOS-latest", "rust-target": "x86_64-apple-darwin", "name": "macOS" }, - { "os": "windows-latest", "rust-target": "x86_64-pc-windows-msvc", "name": "Windows" }, - ] - - steps: - - uses: actions/checkout@v1 - - uses: actions/setup-python@v1 - with: - python-version: "3.x" - - run: pip install PyGithub - - run: sudo apt-get install musl-tools 
- if: matrix.target.os == 'ubuntu-latest' - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - profile: minimal - - run: rustup target add ${{ matrix.target.rust-target }} - - name: build b3sum - id: build_b3sum - run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} - - name: upload release asset - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_TAG: ${{ github.ref }} - run: python -u .github/workflows/upload_github_release_asset.py ${{ steps.build_b3sum.outputs.bin_path }} diff --git a/thirdparty/BLAKE3/.github/workflows/upload_github_release_asset.py b/thirdparty/BLAKE3/.github/workflows/upload_github_release_asset.py deleted file mode 100644 index c1cbf518b..000000000 --- a/thirdparty/BLAKE3/.github/workflows/upload_github_release_asset.py +++ /dev/null @@ -1,65 +0,0 @@ -#! /usr/bin/env python3 - -import github -import os -import sys - -RETRIES = 10 - -g = github.Github(os.environ["GITHUB_TOKEN"]) -tag_name = os.environ["GITHUB_TAG"] -tag_prefix = "refs/tags/" -if tag_name.startswith(tag_prefix): - tag_name = tag_name[len(tag_prefix):] -assert len(sys.argv) == 2 -asset_path = sys.argv[1] -asset_name = os.path.basename(asset_path) - -repo = g.get_repo(os.environ["GITHUB_REPOSITORY"]) - -tags = list(repo.get_tags()) - -for tag in tags: - if tag.name == tag_name: - break -else: - raise RuntimeError("no tag named " + repr(tag_name)) - -try: - print("Creating GitHub release for tag " + repr(tag_name) + "...") - repo.create_git_release(tag_name, tag_name, tag.commit.commit.message) -except github.GithubException as github_error: - if github_error.data["errors"][0]["code"] == "already_exists": - print("Release for tag " + repr(tag_name) + " already exists.") - else: - raise - -releases = list(repo.get_releases()) -for release in releases: - if release.tag_name == tag_name: - break -else: - raise RuntimeError("no release for tag " + repr(tag_name)) - -print("Uploading " + repr(asset_path) + "...") -for i in 
range(RETRIES): - try: - print("Upload attempt #{} of {}...".format(i + 1, RETRIES)) - release.upload_asset(asset_path) - break - except github.GithubException as github_error: - # Unfortunately the asset upload API is flaky. Even worse, it often - # partially succeeds, returning an error to the caller but leaving the - # release in a state where subsequent uploads of the same asset will - # fail with an "already_exists" error. (Though the asset is not visible - # on github.com, so we can't just declare victory and move on.) If we - # detect this case, explicitly delete the asset and continue retrying. - print(github_error) - for asset in release.get_assets(): - if asset.name == asset_name: - print("Found uploaded asset after failure. Deleting...") - asset.delete_asset() -else: - raise RuntimeError("All upload attempts failed.") - -print("Success!") diff --git a/thirdparty/BLAKE3/.gitignore b/thirdparty/BLAKE3/.gitignore deleted file mode 100644 index fa8d85ac5..000000000 --- a/thirdparty/BLAKE3/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -Cargo.lock -target diff --git a/thirdparty/BLAKE3/CONTRIBUTING.md b/thirdparty/BLAKE3/CONTRIBUTING.md deleted file mode 100644 index 3a605f255..000000000 --- a/thirdparty/BLAKE3/CONTRIBUTING.md +++ /dev/null @@ -1,31 +0,0 @@ -# Contributing - -We welcome and encourage third-party contributions to BLAKE3, be it reports of issues encountered while using the software or proposals of patches. - -## Bug reports - -Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). - -If you report a bug, please: - -* Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). -* Provide information to help us diagnose and ideally reproduce the bug. - -## Patches - -We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring it in the PR. 
- -If you contribute code and submit a patch, please note the following: - -* We use Rust's stable branch for developing BLAKE3. -* Pull requests should target the `master` branch. -* Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/). - -Also please make sure to create new unit tests covering your code additions. You can execute the tests by running: - -```bash -cargo test -``` - -All third-party contributions will be recognized in the list of contributors. - diff --git a/thirdparty/BLAKE3/Cargo.toml b/thirdparty/BLAKE3/Cargo.toml deleted file mode 100644 index 3df0fd279..000000000 --- a/thirdparty/BLAKE3/Cargo.toml +++ /dev/null @@ -1,90 +0,0 @@ -[package] -name = "blake3" -version = "0.3.7" -authors = ["Jack O'Connor <[email protected]>"] -description = "the BLAKE3 hash function" -repository = "https://github.com/BLAKE3-team/BLAKE3" -license = "CC0-1.0 OR Apache-2.0" -documentation = "https://docs.rs/blake3" -readme = "README.md" -edition = "2018" - -[features] -default = ["std"] - -# The NEON implementation does not participate in dynamic feature detection, -# which is currently x86-only. If "neon" is on, NEON support is assumed. Note -# that AArch64 always supports NEON, but support on ARMv7 varies. The NEON -# implementation uses C intrinsics and requires a C compiler. -neon = [] - -# This crate uses libstd for std::io trait implementations, and also for -# runtime CPU feature detection. This feature is enabled by default. If you use -# --no-default-features, the only way to use the SIMD implementations in this -# crate is to enable the corresponding instruction sets statically for the -# entire build, with e.g. RUSTFLAGS="-C target-cpu=native". -std = ["digest/std"] - -# The "rayon" feature (defined below as an optional dependency) enables the -# join::RayonJoin type, which can be used with Hasher::update_with_join to -# perform multi-threaded hashing. 
However, even if this feature is enabled, all -# other APIs remain single-threaded. - -# ---------- Features below this line are for internal testing only. ---------- - -# By default on x86_64, this crate uses Samuel Neves' hand-written assembly -# implementations for SSE4.1, AVX2, and AVX512. (These provide both the best -# runtime performance, and the fastest build times.) And by default on 32-bit -# x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and -# a C intrinsics implementation for AVX-512. In both cases, if a C compiler is -# not detected, or if AVX-512 support is missing from the detected compiler, -# build.rs automatically falls back to a pure Rust build. This feature forces -# that fallback, for testing purposes. (Note that in CI testing, we set the -# BLAKE3_CI environment variable, which instructs build.rs to error out rather -# than doing an automatic fallback.) -pure = [] - -# As described above, on x86_64 this crate use assembly implementations by -# default. Enabling the "prefer_intrinsics" feature makes this crate use -# intrinsics implementations on both 32-bit and 64-bit x86, again for testing -# purposes. -prefer_intrinsics = [] - -# Disable individual instruction sets. CI testing uses these flags to simulate -# different levels of hardware SIMD support. Note that code for the -# corresponding instruction set is still compiled; only detection is disabled. -# -# As noted above, these flags are *for testing only* and are not stable. It's -# possible that some users might find that their particular use case performs -# better if e.g. AVX-512 is disabled, because of issues like CPU downlocking. -# If that comes up, and if disabling the instruction set here at the feature -# level turns out to be the right approach, then we can design a stable -# feature. Until then, we reserve the right to break these features in a patch -# release. 
-no_sse2 = [] -no_sse41 = [] -no_avx2 = [] -no_avx512 = [] - -[package.metadata.docs.rs] -# Document blake3::join::RayonJoin on docs.rs. -features = ["rayon"] - -[dependencies] -arrayref = "0.3.5" -arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] } -constant_time_eq = "0.1.5" -rayon = { version = "1.2.1", optional = true } -cfg-if = "0.1.10" -digest = "0.9.0" -crypto-mac = "0.8.0" - -[dev-dependencies] -hex = "0.4.2" -page_size = "0.4.1" -rand = "0.7.2" -rand_chacha = "0.2.1" -reference_impl = { path = "./reference_impl" } - -[build-dependencies] -cc = "1.0.4" diff --git a/thirdparty/BLAKE3/LICENSE b/thirdparty/BLAKE3/LICENSE deleted file mode 100644 index f5892efc3..000000000 --- a/thirdparty/BLAKE3/LICENSE +++ /dev/null @@ -1,330 +0,0 @@ -This work is released into the public domain with CC0 1.0. Alternatively, it is -licensed under the Apache License 2.0. - -------------------------------------------------------------------------------- - -Creative Commons Legal Code - -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). 
- -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. 
database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. 
In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. 
Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. - -------------------------------------------------------------------------------- - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright 2019 Jack O'Connor and Samuel Neves - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/thirdparty/BLAKE3/README.md b/thirdparty/BLAKE3/README.md deleted file mode 100644 index 360183668..000000000 --- a/thirdparty/BLAKE3/README.md +++ /dev/null @@ -1,202 +0,0 @@ -# <a href="#"><img src="media/BLAKE3.svg" alt="BLAKE3" height=50></a> - -BLAKE3 is a cryptographic hash function that is: - -- **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2. -- **Secure**, unlike MD5 and SHA-1. And secure against length extension, - unlike SHA-2. -- **Highly parallelizable** across any number of threads and SIMD lanes, - because it's a Merkle tree on the inside. -- Capable of **verified streaming** and **incremental updates**, again - because it's a Merkle tree. -- A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash. -- **One algorithm with no variants**, which is fast on x86-64 and also - on smaller architectures. - -The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py) -is an example benchmark of 16 KiB inputs on modern server hardware (a Cascade -Lake-SP 8275CL processor). For more detailed benchmarks, see the -[BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). 
- -<p align="center"> -<img src="media/speed.svg" alt="performance graph"> -</p> - -BLAKE3 is based on an optimized instance of the established hash -function [BLAKE2](https://blake2.net) and on the [original Bao tree -mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md). -The specifications and design rationale are available in the [BLAKE3 -paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). -The default output size is 256 bits. The current version of -[Bao](https://github.com/oconnor663/bao) implements verified streaming -with BLAKE3. - -This repository is the official implementation of BLAKE3. It includes: - -* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which - includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512, - and NEON, with automatic runtime CPU feature detection on x86. The - `rayon` feature provides multithreading. - -* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which - provides a command line interface. It uses multithreading by default, - making it an order of magnitude faster than e.g. `sha256sum` on - typical desktop hardware. - -* The [C implementation](c), which like the Rust implementation includes - SIMD code and runtime CPU feature detection on x86. Unlike the Rust - implementation, it's not currently multithreaded. See - [`c/README.md`](c/README.md). - -* The [reference implementation](reference_impl/reference_impl.rs), - which is discussed in Section 5.1 of the [BLAKE3 - paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). - This implementation is much smaller and simpler than the optimized - ones above. If you want to see how BLAKE3 works, or you're writing a - port that doesn't need multithreading or SIMD optimizations, start - here. - -* A [set of test - vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json) - that covers extended outputs, all three modes, and a variety of input - lengths. 
- -* [](https://github.com/BLAKE3-team/BLAKE3/actions) - -BLAKE3 was designed by: - -* [@oconnor663 ](https://github.com/oconnor663) (Jack O'Connor) -* [@sneves](https://github.com/sneves) (Samuel Neves) -* [@veorq](https://github.com/veorq) (Jean-Philippe Aumasson) -* [@zookozcash](https://github.com/zookozcash) (Zooko) - -The development of BLAKE3 was sponsored by -[Teserakt](https://teserakt.io) and [Electric Coin Company](https://electriccoin.co). - -*NOTE: BLAKE3 is not a password hashing algorithm, because it's -designed to be fast, whereas password hashing should not be fast. If you -hash passwords to store the hashes or if you derive keys from passwords, -we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* - -## Usage - -### The `b3sum` utility - -The `b3sum` command line utility prints the BLAKE3 hashes of files or of -standard input. Prebuilt binaries are available for Linux, Windows, and -macOS (requiring the [unidentified developer -workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) -on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). -If you've [installed Rust and -Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), -you can also build `b3sum` yourself with: - -```bash -cargo install b3sum -``` - -If `rustup` didn't configure your `PATH` for you, you might need to go -looking for the installed binary in e.g. `~/.cargo/bin`. You can test -out how fast BLAKE3 is on your machine by creating a big file and -hashing it, for example: - -```bash -# Create a 1 GB file. -head -c 1000000000 /dev/zero > /tmp/bigfile -# Hash it with SHA-256. -time openssl sha256 /tmp/bigfile -# Hash it with BLAKE3. -time b3sum /tmp/bigfile -``` - -### The `blake3` crate [](https://docs.rs/blake3) - -To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to -your `Cargo.toml`. 
Here's an example of hashing some input bytes: - -```rust -// Hash an input all at once. -let hash1 = blake3::hash(b"foobarbaz"); - -// Hash an input incrementally. -let mut hasher = blake3::Hasher::new(); -hasher.update(b"foo"); -hasher.update(b"bar"); -hasher.update(b"baz"); -let hash2 = hasher.finalize(); -assert_eq!(hash1, hash2); - -// Extended output. OutputReader also implements Read and Seek. -let mut output = [0; 1000]; -let mut output_reader = hasher.finalize_xof(); -output_reader.fill(&mut output); -assert_eq!(&output[..32], hash1.as_bytes()); - -// Print a hash as hex. -println!("{}", hash1.to_hex()); -``` - -Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and -`derive_key`. The `keyed_hash` mode takes a 256-bit key: - -```rust -// MAC an input all at once. -let example_key = [42u8; 32]; -let mac1 = blake3::keyed_hash(&example_key, b"example input"); - -// MAC incrementally. -let mut hasher = blake3::Hasher::new_keyed(&example_key); -hasher.update(b"example input"); -let mac2 = hasher.finalize(); -assert_eq!(mac1, mac2); -``` - -The `derive_key` mode takes a context string of any length and key -material of any length, and it outputs a derived key of any length. The -context string should be hardcoded, globally unique, and -application-specific. A good default format for the context string is -`"[application] [commit timestamp] [purpose]"`: - -```rust -// Derive a couple of subkeys for different purposes. -const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key"; -const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key"; -let input_key = b"some very secret key material (>'-')> <('-'<) ^('-')^"; -let mut email_key = [0; 32]; -blake3::derive_key(EMAIL_CONTEXT, input_key, &mut email_key); -let mut api_key = [0; 32]; -blake3::derive_key(API_CONTEXT, input_key, &mut api_key); -assert!(email_key != api_key); -``` - -### The C implementation - -See [`c/README.md`](c/README.md). 
- -### Other implementations - -We post links to third-party bindings and implementations on the -[@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever -we hear about them. Some highlights include [an optimized Go -implementation](https://github.com/zeebo/blake3), [Wasm bindings for -Node.js and browsers](https://github.com/connor4312/blake3), and [binary -wheels for Python](https://github.com/oconnor663/blake3-py). - -## Contributing - -Please see [CONTRIBUTING.md](CONTRIBUTING.md). - -## Intellectual property - -The Rust code is copyright Jack O'Connor, 2019-2020. The C code is -copyright Samuel Neves and Jack O'Connor, 2019-2020. The assembly code -is copyright Samuel Neves, 2019-2020. - -This work is released into the public domain with CC0 1.0. -Alternatively, it is licensed under the Apache License 2.0. - -## Miscellany - -- [@veorq](https://github.com/veorq) and - [@oconnor663](https://github.com/oconnor663) did [a podcast - interview](https://www.cryptography.fm/3) about designing BLAKE3. 
diff --git a/thirdparty/BLAKE3/b3sum/Cargo.toml b/thirdparty/BLAKE3/b3sum/Cargo.toml deleted file mode 100644 index 4678bee2d..000000000 --- a/thirdparty/BLAKE3/b3sum/Cargo.toml +++ /dev/null @@ -1,27 +0,0 @@ -[package] -name = "b3sum" -version = "0.3.7" -authors = ["Jack O'Connor <[email protected]>"] -description = "a command line implementation of the BLAKE3 hash function" -repository = "https://github.com/BLAKE3-team/BLAKE3" -license = "CC0-1.0 OR Apache-2.0" -readme = "README.md" -edition = "2018" - -[features] -neon = ["blake3/neon"] -prefer_intrinsics = ["blake3/prefer_intrinsics"] -pure = ["blake3/pure"] - -[dependencies] -anyhow = "1.0.25" -blake3 = { version = "0.3", path = "..", features = ["rayon"] } -clap = "2.33.1" -hex = "0.4.0" -memmap = "0.7.0" -rayon = "1.2.1" -wild = "2.0.3" - -[dev-dependencies] -duct = "0.13.3" -tempfile = "3.1.0" diff --git a/thirdparty/BLAKE3/b3sum/README.md b/thirdparty/BLAKE3/b3sum/README.md deleted file mode 100644 index e97830b7c..000000000 --- a/thirdparty/BLAKE3/b3sum/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# b3sum - -A command line utility for calculating -[BLAKE3](https://github.com/BLAKE3-team/BLAKE3) hashes, similar to -Coreutils tools like `b2sum` or `md5sum`. - -``` -b3sum 0.3.6 - -USAGE: - b3sum [FLAGS] [OPTIONS] [FILE]... - -FLAGS: - -c, --check Reads BLAKE3 sums from the [file]s and checks them - -h, --help Prints help information - --keyed Uses the keyed mode. The secret key is read from standard - input, and it must be exactly 32 raw bytes. - --no-mmap Disables memory mapping. Currently this also disables - multithreading. - --no-names Omits filenames in the output - --quiet Skips printing OK for each successfully verified file. - Must be used with --check. - --raw Writes raw output bytes to stdout, rather than hex. - --no-names is implied. In this case, only a single - input is allowed. 
- -V, --version Prints version information - -OPTIONS: - --derive-key <CONTEXT> Uses the key derivation mode, with the given - context string. Cannot be used with --keyed. - -l, --length <LEN> The number of output bytes, prior to hex - encoding (default 32) - --num-threads <NUM> The maximum number of threads to use. By - default, this is the number of logical cores. - If this flag is omitted, or if its value is 0, - RAYON_NUM_THREADS is also respected. - -ARGS: - <FILE>... Files to hash, or checkfiles to check. When no file is given, - or when - is given, read standard input. -``` - -See also [this document about how the `--check` flag -works](https://github.com/BLAKE3-team/BLAKE3/blob/master/b3sum/what_does_check_do.md). - -# Example - -Hash the file `foo.txt`: - -```bash -b3sum foo.txt -``` - -Time hashing a gigabyte of data, to see how fast it is: - -```bash -# Create a 1 GB file. -head -c 1000000000 /dev/zero > /tmp/bigfile -# Hash it with SHA-256. -time openssl sha256 /tmp/bigfile -# Hash it with BLAKE3. -time b3sum /tmp/bigfile -``` - - -# Installation - -Prebuilt binaries are available for Linux, Windows, and macOS (requiring -the [unidentified developer -workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) -on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). -If you've [installed Rust and -Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), -you can also build `b3sum` yourself with: - -``` -cargo install b3sum -``` - -On Linux for example, Cargo will put the compiled binary in -`~/.cargo/bin`. You might want to add that directory to your `$PATH`, or -`rustup` might have done it for you when you installed Cargo. - -If you want to install directly from this directory, you can run `cargo -install --path .`. Or you can just build with `cargo build --release`, -which puts the binary at `./target/release/b3sum`. 
diff --git a/thirdparty/BLAKE3/b3sum/src/main.rs b/thirdparty/BLAKE3/b3sum/src/main.rs deleted file mode 100644 index b01e5de58..000000000 --- a/thirdparty/BLAKE3/b3sum/src/main.rs +++ /dev/null @@ -1,621 +0,0 @@ -use anyhow::{bail, ensure, Context, Result}; -use clap::{App, Arg}; -use std::cmp; -use std::convert::TryInto; -use std::fs::File; -use std::io; -use std::io::prelude::*; -use std::path::{Path, PathBuf}; - -#[cfg(test)] -mod unit_tests; - -const NAME: &str = "b3sum"; - -const FILE_ARG: &str = "FILE"; -const DERIVE_KEY_ARG: &str = "derive-key"; -const KEYED_ARG: &str = "keyed"; -const LENGTH_ARG: &str = "length"; -const NO_MMAP_ARG: &str = "no-mmap"; -const NO_NAMES_ARG: &str = "no-names"; -const NUM_THREADS_ARG: &str = "num-threads"; -const RAW_ARG: &str = "raw"; -const CHECK_ARG: &str = "check"; -const QUIET_ARG: &str = "quiet"; - -struct Args { - inner: clap::ArgMatches<'static>, - file_args: Vec<PathBuf>, - base_hasher: blake3::Hasher, -} - -impl Args { - fn parse() -> Result<Self> { - let inner = App::new(NAME) - .version(env!("CARGO_PKG_VERSION")) - .arg(Arg::with_name(FILE_ARG).multiple(true).help( - "Files to hash, or checkfiles to check. When no file is given,\n\ - or when - is given, read standard input.", - )) - .arg( - Arg::with_name(LENGTH_ARG) - .long(LENGTH_ARG) - .short("l") - .takes_value(true) - .value_name("LEN") - .help( - "The number of output bytes, prior to hex\n\ - encoding (default 32)", - ), - ) - .arg( - Arg::with_name(NUM_THREADS_ARG) - .long(NUM_THREADS_ARG) - .takes_value(true) - .value_name("NUM") - .help( - "The maximum number of threads to use. By\n\ - default, this is the number of logical cores.\n\ - If this flag is omitted, or if its value is 0,\n\ - RAYON_NUM_THREADS is also respected.", - ), - ) - .arg( - Arg::with_name(KEYED_ARG) - .long(KEYED_ARG) - .requires(FILE_ARG) - .help( - "Uses the keyed mode. 
The secret key is read from standard\n\ - input, and it must be exactly 32 raw bytes.", - ), - ) - .arg( - Arg::with_name(DERIVE_KEY_ARG) - .long(DERIVE_KEY_ARG) - .conflicts_with(KEYED_ARG) - .takes_value(true) - .value_name("CONTEXT") - .help( - "Uses the key derivation mode, with the given\n\ - context string. Cannot be used with --keyed.", - ), - ) - .arg(Arg::with_name(NO_MMAP_ARG).long(NO_MMAP_ARG).help( - "Disables memory mapping. Currently this also disables\n\ - multithreading.", - )) - .arg( - Arg::with_name(NO_NAMES_ARG) - .long(NO_NAMES_ARG) - .help("Omits filenames in the output"), - ) - .arg(Arg::with_name(RAW_ARG).long(RAW_ARG).help( - "Writes raw output bytes to stdout, rather than hex.\n\ - --no-names is implied. In this case, only a single\n\ - input is allowed.", - )) - .arg( - Arg::with_name(CHECK_ARG) - .long(CHECK_ARG) - .short("c") - .conflicts_with(DERIVE_KEY_ARG) - .conflicts_with(KEYED_ARG) - .conflicts_with(LENGTH_ARG) - .conflicts_with(RAW_ARG) - .conflicts_with(NO_NAMES_ARG) - .help("Reads BLAKE3 sums from the [file]s and checks them"), - ) - .arg( - Arg::with_name(QUIET_ARG) - .long(QUIET_ARG) - .requires(CHECK_ARG) - .help( - "Skips printing OK for each successfully verified file.\n\ - Must be used with --check.", - ), - ) - // wild::args_os() is equivalent to std::env::args_os() on Unix, - // but on Windows it adds support for globbing. - .get_matches_from(wild::args_os()); - let file_args = if let Some(iter) = inner.values_of_os(FILE_ARG) { - iter.map(|s| s.into()).collect() - } else { - vec!["-".into()] - }; - if inner.is_present(RAW_ARG) && file_args.len() > 1 { - bail!("Only one filename can be provided when using --raw"); - } - let base_hasher = if inner.is_present(KEYED_ARG) { - // In keyed mode, since stdin is used for the key, we can't handle - // `-` arguments. Input::open handles that case below. - blake3::Hasher::new_keyed(&read_key_from_stdin()?) 
- } else if let Some(context) = inner.value_of(DERIVE_KEY_ARG) { - blake3::Hasher::new_derive_key(context) - } else { - blake3::Hasher::new() - }; - Ok(Self { - inner, - file_args, - base_hasher, - }) - } - - fn num_threads(&self) -> Result<Option<usize>> { - if let Some(num_threads_str) = self.inner.value_of(NUM_THREADS_ARG) { - Ok(Some( - num_threads_str - .parse() - .context("Failed to parse num threads.")?, - )) - } else { - Ok(None) - } - } - - fn check(&self) -> bool { - self.inner.is_present(CHECK_ARG) - } - - fn raw(&self) -> bool { - self.inner.is_present(RAW_ARG) - } - - fn no_mmap(&self) -> bool { - self.inner.is_present(NO_MMAP_ARG) - } - - fn no_names(&self) -> bool { - self.inner.is_present(NO_NAMES_ARG) - } - - fn len(&self) -> Result<u64> { - if let Some(length) = self.inner.value_of(LENGTH_ARG) { - length.parse::<u64>().context("Failed to parse length.") - } else { - Ok(blake3::OUT_LEN as u64) - } - } - - fn keyed(&self) -> bool { - self.inner.is_present(KEYED_ARG) - } - - fn quiet(&self) -> bool { - self.inner.is_present(QUIET_ARG) - } -} - -enum Input { - Mmap(io::Cursor<memmap::Mmap>), - File(File), - Stdin, -} - -impl Input { - // Open an input file, using mmap if appropriate. "-" means stdin. Note - // that this convention applies both to command line arguments, and to - // filepaths that appear in a checkfile. - fn open(path: &Path, args: &Args) -> Result<Self> { - if path == Path::new("-") { - if args.keyed() { - bail!("Cannot open `-` in keyed mode"); - } - return Ok(Self::Stdin); - } - let file = File::open(path)?; - if !args.no_mmap() { - if let Some(mmap) = maybe_memmap_file(&file)? { - return Ok(Self::Mmap(io::Cursor::new(mmap))); - } - } - Ok(Self::File(file)) - } - - fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> { - let mut hasher = args.base_hasher.clone(); - match self { - // The fast path: If we mmapped the file successfully, hash using - // multiple threads. 
This doesn't work on stdin, or on some files, - // and it can also be disabled with --no-mmap. - Self::Mmap(cursor) => { - hasher.update_with_join::<blake3::join::RayonJoin>(cursor.get_ref()); - } - // The slower paths, for stdin or files we didn't/couldn't mmap. - // This is currently all single-threaded. Doing multi-threaded - // hashing without memory mapping is tricky, since all your worker - // threads have to stop every time you refill the buffer, and that - // ends up being a lot of overhead. To solve that, we need a more - // complicated double-buffering strategy where a background thread - // fills one buffer while the worker threads are hashing the other - // one. We might implement that in the future, but since this is - // the slow path anyway, it's not high priority. - Self::File(file) => { - copy_wide(file, &mut hasher)?; - } - Self::Stdin => { - let stdin = io::stdin(); - let lock = stdin.lock(); - copy_wide(lock, &mut hasher)?; - } - } - Ok(hasher.finalize_xof()) - } -} - -impl Read for Input { - fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { - match self { - Self::Mmap(cursor) => cursor.read(buf), - Self::File(file) => file.read(buf), - Self::Stdin => io::stdin().read(buf), - } - } -} - -// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets -// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms -// can support at least 64 KiB, and there's some performance benefit to using -// bigger reads, so that's what we use here. -fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> { - let mut buffer = [0; 65536]; - let mut total = 0; - loop { - match reader.read(&mut buffer) { - Ok(0) => return Ok(total), - Ok(n) => { - hasher.update(&buffer[..n]); - total += n as u64; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => return Err(e), - } - } -} - -// Mmap a file, if it looks like a good idea. 
Return None in cases where we -// know mmap will fail, or if the file is short enough that mmapping isn't -// worth it. However, if we do try to mmap and it fails, return the error. -fn maybe_memmap_file(file: &File) -> Result<Option<memmap::Mmap>> { - let metadata = file.metadata()?; - let file_size = metadata.len(); - Ok(if !metadata.is_file() { - // Not a real file. - None - } else if file_size > isize::max_value() as u64 { - // Too long to safely map. - // https://github.com/danburkert/memmap-rs/issues/69 - None - } else if file_size == 0 { - // Mapping an empty file currently fails. - // https://github.com/danburkert/memmap-rs/issues/72 - None - } else if file_size < 16 * 1024 { - // Mapping small files is not worth it. - None - } else { - // Explicitly set the length of the memory map, so that filesystem - // changes can't race to violate the invariants we just checked. - let map = unsafe { - memmap::MmapOptions::new() - .len(file_size as usize) - .map(&file)? - }; - Some(map) - }) -} - -fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { - // Encoding multiples of the block size is most efficient. 
- let mut len = args.len()?; - let mut block = [0; blake3::BLOCK_LEN]; - while len > 0 { - output.fill(&mut block); - let hex_str = hex::encode(&block[..]); - let take_bytes = cmp::min(len, block.len() as u64); - print!("{}", &hex_str[..2 * take_bytes as usize]); - len -= take_bytes; - } - Ok(()) -} - -fn write_raw_output(output: blake3::OutputReader, args: &Args) -> Result<()> { - let mut output = output.take(args.len()?); - let stdout = std::io::stdout(); - let mut handler = stdout.lock(); - std::io::copy(&mut output, &mut handler)?; - - Ok(()) -} - -fn read_key_from_stdin() -> Result<[u8; blake3::KEY_LEN]> { - let mut bytes = Vec::with_capacity(blake3::KEY_LEN + 1); - let n = std::io::stdin() - .lock() - .take(blake3::KEY_LEN as u64 + 1) - .read_to_end(&mut bytes)?; - if n < 32 { - bail!( - "expected {} key bytes from stdin, found {}", - blake3::KEY_LEN, - n, - ) - } else if n > 32 { - bail!("read more than {} key bytes from stdin", blake3::KEY_LEN) - } else { - Ok(bytes[..blake3::KEY_LEN].try_into().unwrap()) - } -} - -struct FilepathString { - filepath_string: String, - is_escaped: bool, -} - -// returns (string, did_escape) -fn filepath_to_string(filepath: &Path) -> FilepathString { - let unicode_cow = filepath.to_string_lossy(); - let mut filepath_string = unicode_cow.to_string(); - // If we're on Windows, normalize backslashes to forward slashes. This - // avoids a lot of ugly escaping in the common case, and it makes - // checkfiles created on Windows more likely to be portable to Unix. It - // also allows us to set a blanket "no backslashes allowed in checkfiles on - // Windows" rule, rather than allowing a Unix backslash to potentially get - // interpreted as a directory separator on Windows. 
- if cfg!(windows) { - filepath_string = filepath_string.replace('\\', "/"); - } - let mut is_escaped = false; - if filepath_string.contains('\\') || filepath_string.contains('\n') { - filepath_string = filepath_string.replace('\\', "\\\\").replace('\n', "\\n"); - is_escaped = true; - } - FilepathString { - filepath_string, - is_escaped, - } -} - -fn hex_half_byte(c: char) -> Result<u8> { - // The hex characters in the hash must be lowercase for now, though we - // could support uppercase too if we wanted to. - if '0' <= c && c <= '9' { - return Ok(c as u8 - '0' as u8); - } - if 'a' <= c && c <= 'f' { - return Ok(c as u8 - 'a' as u8 + 10); - } - bail!("Invalid hex"); -} - -// The `check` command is a security tool. That means it's much better for a -// check to fail more often than it should (a false negative), than for a check -// to ever succeed when it shouldn't (a false positive). By forbidding certain -// characters in checked filepaths, we avoid a class of false positives where -// two different filepaths can get confused with each other. -fn check_for_invalid_characters(utf8_path: &str) -> Result<()> { - // Null characters in paths should never happen, but they can result in a - // path getting silently truncated on Unix. - if utf8_path.contains('\0') { - bail!("Null character in path"); - } - // Because we convert invalid UTF-8 sequences in paths to the Unicode - // replacement character, multiple different invalid paths can map to the - // same UTF-8 string. - if utf8_path.contains('�') { - bail!("Unicode replacement character in path"); - } - // We normalize all Windows backslashes to forward slashes in our output, - // so the only natural way to get a backslash in a checkfile on Windows is - // to construct it on Unix and copy it over. (Or of course you could just - // doctor it by hand.) To avoid confusing this with a directory separator, - // we forbid backslashes entirely on Windows. Note that this check comes - // after unescaping has been done. 
- if cfg!(windows) && utf8_path.contains('\\') { - bail!("Backslash in path"); - } - Ok(()) -} - -fn unescape(mut path: &str) -> Result<String> { - let mut unescaped = String::with_capacity(2 * path.len()); - while let Some(i) = path.find('\\') { - ensure!(i < path.len() - 1, "Invalid backslash escape"); - unescaped.push_str(&path[..i]); - match path[i + 1..].chars().next().unwrap() { - // Anything other than a recognized escape sequence is an error. - 'n' => unescaped.push_str("\n"), - '\\' => unescaped.push_str("\\"), - _ => bail!("Invalid backslash escape"), - } - path = &path[i + 2..]; - } - unescaped.push_str(path); - Ok(unescaped) -} - -#[derive(Debug)] -struct ParsedCheckLine { - file_string: String, - is_escaped: bool, - file_path: PathBuf, - expected_hash: blake3::Hash, -} - -fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> { - // Trim off the trailing newline, if any. - line = line.trim_end_matches('\n'); - // If there's a backslash at the front of the line, that means we need to - // unescape the path below. This matches the behavior of e.g. md5sum. - let first = if let Some(c) = line.chars().next() { - c - } else { - bail!("Empty line"); - }; - let mut is_escaped = false; - if first == '\\' { - is_escaped = true; - line = &line[1..]; - } - // The front of the line must be a hash of the usual length, followed by - // two spaces. The hex characters in the hash must be lowercase for now, - // though we could support uppercase too if we wanted to. - let hash_hex_len = 2 * blake3::OUT_LEN; - let num_spaces = 2; - let prefix_len = hash_hex_len + num_spaces; - ensure!(line.len() > prefix_len, "Short line"); - ensure!( - line.chars().take(prefix_len).all(|c| c.is_ascii()), - "Non-ASCII prefix" - ); - ensure!(&line[hash_hex_len..][..2] == " ", "Invalid space"); - // Decode the hash hex. 
- let mut hash_bytes = [0; blake3::OUT_LEN]; - let mut hex_chars = line[..hash_hex_len].chars(); - for byte in &mut hash_bytes { - let high_char = hex_chars.next().unwrap(); - let low_char = hex_chars.next().unwrap(); - *byte = 16 * hex_half_byte(high_char)? + hex_half_byte(low_char)?; - } - let expected_hash: blake3::Hash = hash_bytes.into(); - let file_string = line[prefix_len..].to_string(); - let file_path_string = if is_escaped { - // If we detected a backslash at the start of the line earlier, now we - // need to unescape backslashes and newlines. - unescape(&file_string)? - } else { - file_string.clone().into() - }; - check_for_invalid_characters(&file_path_string)?; - Ok(ParsedCheckLine { - file_string, - is_escaped, - file_path: file_path_string.into(), - expected_hash, - }) -} - -fn hash_one_input(path: &Path, args: &Args) -> Result<()> { - let mut input = Input::open(path, args)?; - let output = input.hash(args)?; - if args.raw() { - write_raw_output(output, args)?; - return Ok(()); - } - if args.no_names() { - write_hex_output(output, args)?; - println!(); - return Ok(()); - } - let FilepathString { - filepath_string, - is_escaped, - } = filepath_to_string(path); - if is_escaped { - print!("\\"); - } - write_hex_output(output, args)?; - println!(" {}", filepath_string); - Ok(()) -} - -// Returns true for success. Having a boolean return value here, instead of -// passing down the some_file_failed reference, makes it less likely that we -// might forget to set it in some error condition. 
-fn check_one_line(line: &str, args: &Args) -> bool { - let parse_result = parse_check_line(&line); - let ParsedCheckLine { - file_string, - is_escaped, - file_path, - expected_hash, - } = match parse_result { - Ok(parsed) => parsed, - Err(e) => { - eprintln!("{}: {}", NAME, e); - return false; - } - }; - let file_string = if is_escaped { - "\\".to_string() + &file_string - } else { - file_string - }; - let hash_result: Result<blake3::Hash> = Input::open(&file_path, args) - .and_then(|mut input| input.hash(args)) - .map(|mut hash_output| { - let mut found_hash_bytes = [0; blake3::OUT_LEN]; - hash_output.fill(&mut found_hash_bytes); - found_hash_bytes.into() - }); - let found_hash: blake3::Hash = match hash_result { - Ok(hash) => hash, - Err(e) => { - println!("{}: FAILED ({})", file_string, e); - return false; - } - }; - // This is a constant-time comparison. - if expected_hash == found_hash { - if !args.quiet() { - println!("{}: OK", file_string); - } - true - } else { - println!("{}: FAILED", file_string); - false - } -} - -fn check_one_checkfile(path: &Path, args: &Args, some_file_failed: &mut bool) -> Result<()> { - let checkfile_input = Input::open(path, args)?; - let mut bufreader = io::BufReader::new(checkfile_input); - let mut line = String::new(); - loop { - line.clear(); - let n = bufreader.read_line(&mut line)?; - if n == 0 { - return Ok(()); - } - // check_one_line() prints errors and turns them into a success=false - // return, so it doesn't return a Result. - let success = check_one_line(&line, args); - if !success { - *some_file_failed = true; - } - } -} - -fn main() -> Result<()> { - let args = Args::parse()?; - let mut thread_pool_builder = rayon::ThreadPoolBuilder::new(); - if let Some(num_threads) = args.num_threads()? 
{ - thread_pool_builder = thread_pool_builder.num_threads(num_threads); - } - let thread_pool = thread_pool_builder.build()?; - thread_pool.install(|| { - let mut some_file_failed = false; - // Note that file_args automatically includes `-` if nothing is given. - for path in &args.file_args { - if args.check() { - // A hash mismatch or a failure to read a hashed file will be - // printed in the checkfile loop, and will not propagate here. - // This is similar to the explicit error handling we do in the - // hashing case immediately below. In these cases, - // some_file_failed will be set to true. - check_one_checkfile(path, &args, &mut some_file_failed)?; - } else { - // Errors encountered in hashing are tolerated and printed to - // stderr. This allows e.g. `b3sum *` to print errors for - // non-files and keep going. However, if we encounter any - // errors we'll still return non-zero at the end. - let result = hash_one_input(path, &args); - if let Err(e) = result { - some_file_failed = true; - eprintln!("{}: {}: {}", NAME, path.to_string_lossy(), e); - } - } - } - std::process::exit(if some_file_failed { 1 } else { 0 }); - }) -} diff --git a/thirdparty/BLAKE3/b3sum/src/unit_tests.rs b/thirdparty/BLAKE3/b3sum/src/unit_tests.rs deleted file mode 100644 index 1fa1a17dc..000000000 --- a/thirdparty/BLAKE3/b3sum/src/unit_tests.rs +++ /dev/null @@ -1,189 +0,0 @@ -use std::path::Path; - -#[test] -fn test_parse_check_line() { - // ========================= - // ===== Success Cases ===== - // ========================= - - // the basic case - let crate::ParsedCheckLine { - file_string, - is_escaped, - file_path, - expected_hash, - } = crate::parse_check_line( - "0909090909090909090909090909090909090909090909090909090909090909 foo", - ) - .unwrap(); - assert_eq!(expected_hash, blake3::Hash::from([0x09; 32])); - assert!(!is_escaped); - assert_eq!(file_string, "foo"); - assert_eq!(file_path, Path::new("foo")); - - // regular whitespace - let crate::ParsedCheckLine { - 
file_string, - is_escaped, - file_path, - expected_hash, - } = crate::parse_check_line( - "fafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafa fo \to\n\n\n", - ) - .unwrap(); - assert_eq!(expected_hash, blake3::Hash::from([0xfa; 32])); - assert!(!is_escaped); - assert_eq!(file_string, "fo \to"); - assert_eq!(file_path, Path::new("fo \to")); - - // path is one space - let crate::ParsedCheckLine { - file_string, - is_escaped, - file_path, - expected_hash, - } = crate::parse_check_line( - "4242424242424242424242424242424242424242424242424242424242424242 ", - ) - .unwrap(); - assert_eq!(expected_hash, blake3::Hash::from([0x42; 32])); - assert!(!is_escaped); - assert_eq!(file_string, " "); - assert_eq!(file_path, Path::new(" ")); - - // *Unescaped* backslashes. Note that this line does *not* start with a - // backslash, so something like "\" + "n" is interpreted as *two* - // characters. We forbid all backslashes on Windows, so this test is - // Unix-only. - if cfg!(not(windows)) { - let crate::ParsedCheckLine { - file_string, - is_escaped, - file_path, - expected_hash, - } = crate::parse_check_line( - "4343434343434343434343434343434343434343434343434343434343434343 fo\\a\\no", - ) - .unwrap(); - assert_eq!(expected_hash, blake3::Hash::from([0x43; 32])); - assert!(!is_escaped); - assert_eq!(file_string, "fo\\a\\no"); - assert_eq!(file_path, Path::new("fo\\a\\no")); - } - - // escaped newline - let crate::ParsedCheckLine { - file_string, - is_escaped, - file_path, - expected_hash, - } = crate::parse_check_line( - "\\4444444444444444444444444444444444444444444444444444444444444444 fo\\n\\no", - ) - .unwrap(); - assert_eq!(expected_hash, blake3::Hash::from([0x44; 32])); - assert!(is_escaped); - assert_eq!(file_string, "fo\\n\\no"); - assert_eq!(file_path, Path::new("fo\n\no")); - - // Escaped newline and backslash. Again because backslash is not allowed on - // Windows, this test is Unix-only. 
- if cfg!(not(windows)) { - let crate::ParsedCheckLine { - file_string, - is_escaped, - file_path, - expected_hash, - } = crate::parse_check_line( - "\\4545454545454545454545454545454545454545454545454545454545454545 fo\\n\\\\o", - ) - .unwrap(); - assert_eq!(expected_hash, blake3::Hash::from([0x45; 32])); - assert!(is_escaped); - assert_eq!(file_string, "fo\\n\\\\o"); - assert_eq!(file_path, Path::new("fo\n\\o")); - } - - // non-ASCII path - let crate::ParsedCheckLine { - file_string, - is_escaped, - file_path, - expected_hash, - } = crate::parse_check_line( - "4646464646464646464646464646464646464646464646464646464646464646 否认", - ) - .unwrap(); - assert_eq!(expected_hash, blake3::Hash::from([0x46; 32])); - assert!(!is_escaped); - assert_eq!(file_string, "否认"); - assert_eq!(file_path, Path::new("否认")); - - // ========================= - // ===== Failure Cases ===== - // ========================= - - // too short - crate::parse_check_line("").unwrap_err(); - crate::parse_check_line("0").unwrap_err(); - crate::parse_check_line("00").unwrap_err(); - crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000") - .unwrap_err(); - crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 ") - .unwrap_err(); - - // not enough spaces - crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 foo") - .unwrap_err(); - - // capital letter hex - crate::parse_check_line( - "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA foo", - ) - .unwrap_err(); - - // non-hex hex - crate::parse_check_line( - "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx foo", - ) - .unwrap_err(); - - // non-ASCII hex - crate::parse_check_line("你好, 我叫杰克. 认识你很高兴. 要不要吃个香蕉? 
foo").unwrap_err(); - - // invalid escape sequence - crate::parse_check_line( - "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\o", - ) - .unwrap_err(); - - // truncated escape sequence - crate::parse_check_line( - "\\0000000000000000000000000000000000000000000000000000000000000000 foo\\", - ) - .unwrap_err(); - - // null char - crate::parse_check_line( - "0000000000000000000000000000000000000000000000000000000000000000 fo\0o", - ) - .unwrap_err(); - - // Unicode replacement char - crate::parse_check_line( - "0000000000000000000000000000000000000000000000000000000000000000 fo�o", - ) - .unwrap_err(); - - // On Windows only, backslashes are not allowed, escaped or otherwise. - if cfg!(windows) { - crate::parse_check_line( - "0000000000000000000000000000000000000000000000000000000000000000 fo\\o", - ) - .unwrap_err(); - crate::parse_check_line( - "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\\\o", - ) - .unwrap_err(); - } -} diff --git a/thirdparty/BLAKE3/b3sum/tests/cli_tests.rs b/thirdparty/BLAKE3/b3sum/tests/cli_tests.rs deleted file mode 100644 index 51fbbba98..000000000 --- a/thirdparty/BLAKE3/b3sum/tests/cli_tests.rs +++ /dev/null @@ -1,552 +0,0 @@ -use duct::cmd; -use std::ffi::OsString; -use std::fs; -use std::io::prelude::*; -use std::path::PathBuf; - -pub fn b3sum_exe() -> PathBuf { - env!("CARGO_BIN_EXE_b3sum").into() -} - -#[test] -fn test_hash_one() { - let expected = format!("{} -", blake3::hash(b"foo").to_hex()); - let output = cmd!(b3sum_exe()).stdin_bytes("foo").read().unwrap(); - assert_eq!(&*expected, output); -} - -#[test] -fn test_hash_one_raw() { - let expected = blake3::hash(b"foo").as_bytes().to_owned(); - let output = cmd!(b3sum_exe(), "--raw") - .stdin_bytes("foo") - .stdout_capture() - .run() - .unwrap() - .stdout; - assert_eq!(expected, output.as_slice()); -} - -#[test] -fn test_hash_many() { - let dir = tempfile::tempdir().unwrap(); - let file1 = dir.path().join("file1"); - 
fs::write(&file1, b"foo").unwrap(); - let file2 = dir.path().join("file2"); - fs::write(&file2, b"bar").unwrap(); - - let output = cmd!(b3sum_exe(), &file1, &file2).read().unwrap(); - let foo_hash = blake3::hash(b"foo"); - let bar_hash = blake3::hash(b"bar"); - let expected = format!( - "{} {}\n{} {}", - foo_hash.to_hex(), - // account for slash normalization on Windows - file1.to_string_lossy().replace("\\", "/"), - bar_hash.to_hex(), - file2.to_string_lossy().replace("\\", "/"), - ); - assert_eq!(expected, output); - - let output_no_names = cmd!(b3sum_exe(), "--no-names", &file1, &file2) - .read() - .unwrap(); - let expected_no_names = format!("{}\n{}", foo_hash.to_hex(), bar_hash.to_hex(),); - assert_eq!(expected_no_names, output_no_names); -} - -#[test] -fn test_missing_files() { - let dir = tempfile::tempdir().unwrap(); - let file1 = dir.path().join("file1"); - fs::write(&file1, b"foo").unwrap(); - let file2 = dir.path().join("file2"); - fs::write(&file2, b"bar").unwrap(); - - let output = cmd!(b3sum_exe(), "file1", "missing_file", "file2") - .dir(dir.path()) - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - assert!(!output.status.success()); - - let foo_hash = blake3::hash(b"foo"); - let bar_hash = blake3::hash(b"bar"); - let expected_stdout = format!( - "{} file1\n{} file2\n", - foo_hash.to_hex(), - bar_hash.to_hex(), - ); - assert_eq!(expected_stdout.as_bytes(), &output.stdout[..]); - - let bing_error = fs::File::open(dir.path().join("missing_file")).unwrap_err(); - let expected_stderr = format!("b3sum: missing_file: {}\n", bing_error.to_string()); - assert_eq!(expected_stderr.as_bytes(), &output.stderr[..]); -} - -#[test] -fn test_hash_length() { - let mut buf = [0; 100]; - blake3::Hasher::new() - .update(b"foo") - .finalize_xof() - .fill(&mut buf); - let expected = format!("{} -", hex::encode(&buf[..])); - let output = cmd!(b3sum_exe(), "--length=100") - .stdin_bytes("foo") - .read() - .unwrap(); - assert_eq!(&*expected, 
&*output); -} - -#[test] -fn test_keyed() { - let key = [42; blake3::KEY_LEN]; - let f = tempfile::NamedTempFile::new().unwrap(); - f.as_file().write_all(b"foo").unwrap(); - f.as_file().flush().unwrap(); - let expected = blake3::keyed_hash(&key, b"foo").to_hex(); - let output = cmd!(b3sum_exe(), "--keyed", "--no-names", f.path()) - .stdin_bytes(&key[..]) - .read() - .unwrap(); - assert_eq!(&*expected, &*output); -} - -#[test] -fn test_derive_key() { - let context = "BLAKE3 2019-12-28 10:28:41 example context"; - let f = tempfile::NamedTempFile::new().unwrap(); - f.as_file().write_all(b"key material").unwrap(); - f.as_file().flush().unwrap(); - let mut derive_key_out = [0; blake3::OUT_LEN]; - blake3::derive_key(context, b"key material", &mut derive_key_out); - let expected = hex::encode(&derive_key_out); - let output = cmd!(b3sum_exe(), "--derive-key", context, "--no-names", f.path()) - .read() - .unwrap(); - assert_eq!(&*expected, &*output); -} - -#[test] -fn test_no_mmap() { - let f = tempfile::NamedTempFile::new().unwrap(); - f.as_file().write_all(b"foo").unwrap(); - f.as_file().flush().unwrap(); - - let expected = blake3::hash(b"foo").to_hex(); - let output = cmd!(b3sum_exe(), "--no-mmap", "--no-names", f.path()) - .read() - .unwrap(); - assert_eq!(&*expected, &*output); -} - -#[test] -fn test_length_without_value_is_an_error() { - let result = cmd!(b3sum_exe(), "--length") - .stdin_bytes("foo") - .stderr_capture() - .run(); - assert!(result.is_err()); -} - -#[test] -fn test_raw_with_multi_files_is_an_error() { - let f1 = tempfile::NamedTempFile::new().unwrap(); - let f2 = tempfile::NamedTempFile::new().unwrap(); - - // Make sure it doesn't error with just one file - let result = cmd!(b3sum_exe(), "--raw", f1.path()).stdout_capture().run(); - assert!(result.is_ok()); - - // Make sure it errors when both file are passed - let result = cmd!(b3sum_exe(), "--raw", f1.path(), f2.path()) - .stderr_capture() - .run(); - assert!(result.is_err()); -} - -#[test] 
-#[cfg(unix)] -fn test_newline_and_backslash_escaping_on_unix() { - let empty_hash = blake3::hash(b"").to_hex(); - let dir = tempfile::tempdir().unwrap(); - fs::create_dir(dir.path().join("subdir")).unwrap(); - let names = [ - "abcdef", - "abc\ndef", - "abc\\def", - "abc\rdef", - "abc\r\ndef", - "subdir/foo", - ]; - let mut paths = Vec::new(); - for name in &names { - let path = dir.path().join(name); - println!("creating file at {:?}", path); - fs::write(&path, b"").unwrap(); - paths.push(path); - } - let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); - let expected = format!( - "\ -{0} abcdef -\\{0} abc\\ndef -\\{0} abc\\\\def -{0} abc\rdef -\\{0} abc\r\\ndef -{0} subdir/foo", - empty_hash, - ); - println!("output"); - println!("======"); - println!("{}", output); - println!(); - println!("expected"); - println!("========"); - println!("{}", expected); - println!(); - assert_eq!(expected, output); -} - -#[test] -#[cfg(windows)] -fn test_slash_normalization_on_windows() { - let empty_hash = blake3::hash(b"").to_hex(); - let dir = tempfile::tempdir().unwrap(); - fs::create_dir(dir.path().join("subdir")).unwrap(); - // Note that filenames can't contain newlines or backslashes on Windows, so - // we don't test escaping here. We only test forward slash and backslash as - // directory separators. 
- let names = ["abcdef", "subdir/foo", "subdir\\bar"]; - let mut paths = Vec::new(); - for name in &names { - let path = dir.path().join(name); - println!("creating file at {:?}", path); - fs::write(&path, b"").unwrap(); - paths.push(path); - } - let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); - let expected = format!( - "\ -{0} abcdef -{0} subdir/foo -{0} subdir/bar", - empty_hash, - ); - println!("output"); - println!("======"); - println!("{}", output); - println!(); - println!("expected"); - println!("========"); - println!("{}", expected); - println!(); - assert_eq!(expected, output); -} - -#[test] -#[cfg(unix)] -fn test_invalid_unicode_on_unix() { - use std::os::unix::ffi::OsStringExt; - - let empty_hash = blake3::hash(b"").to_hex(); - let dir = tempfile::tempdir().unwrap(); - let names = ["abcdef".into(), OsString::from_vec(b"abc\xffdef".to_vec())]; - let mut paths = Vec::new(); - for name in &names { - let path = dir.path().join(name); - println!("creating file at {:?}", path); - // Note: Some operating systems, macOS in particular, simply don't - // allow invalid Unicode in filenames. On those systems, this write - // will fail. That's fine, we'll just short-circuit this test in that - // case. But assert that at least Linux allows this. 
- let write_result = fs::write(&path, b""); - if cfg!(target_os = "linux") { - write_result.expect("Linux should allow invalid Unicode"); - } else if write_result.is_err() { - return; - } - paths.push(path); - } - let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); - let expected = format!( - "\ -{0} abcdef -{0} abc�def", - empty_hash, - ); - println!("output"); - println!("======"); - println!("{}", output); - println!(); - println!("expected"); - println!("========"); - println!("{}", expected); - println!(); - assert_eq!(expected, output); -} - -#[test] -#[cfg(windows)] -fn test_invalid_unicode_on_windows() { - use std::os::windows::ffi::OsStringExt; - - let empty_hash = blake3::hash(b"").to_hex(); - let dir = tempfile::tempdir().unwrap(); - let surrogate_char = 0xDC00; - let bad_unicode_wchars = [ - 'a' as u16, - 'b' as u16, - 'c' as u16, - surrogate_char, - 'd' as u16, - 'e' as u16, - 'f' as u16, - ]; - let bad_osstring = OsString::from_wide(&bad_unicode_wchars); - let names = ["abcdef".into(), bad_osstring]; - let mut paths = Vec::new(); - for name in &names { - let path = dir.path().join(name); - println!("creating file at {:?}", path); - fs::write(&path, b"").unwrap(); - paths.push(path); - } - let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); - let expected = format!( - "\ -{0} abcdef -{0} abc�def", - empty_hash, - ); - println!("output"); - println!("======"); - println!("{}", output); - println!(); - println!("expected"); - println!("========"); - println!("{}", expected); - println!(); - assert_eq!(expected, output); -} - -#[test] -fn test_check() { - // Make a directory full of files, and make sure the b3sum output in that - // directory is what we expect. 
- let a_hash = blake3::hash(b"a").to_hex(); - let b_hash = blake3::hash(b"b").to_hex(); - let cd_hash = blake3::hash(b"cd").to_hex(); - let dir = tempfile::tempdir().unwrap(); - fs::write(dir.path().join("a"), b"a").unwrap(); - fs::write(dir.path().join("b"), b"b").unwrap(); - fs::create_dir(dir.path().join("c")).unwrap(); - fs::write(dir.path().join("c/d"), b"cd").unwrap(); - let output = cmd!(b3sum_exe(), "a", "b", "c/d") - .dir(dir.path()) - .stdout_capture() - .stderr_capture() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - let expected_checkfile = format!( - "{} a\n\ - {} b\n\ - {} c/d\n", - a_hash, b_hash, cd_hash, - ); - assert_eq!(expected_checkfile, stdout); - assert_eq!("", stderr); - - // Now use the output we just validated as a checkfile, passed to stdin. - let output = cmd!(b3sum_exe(), "--check") - .stdin_bytes(expected_checkfile.as_bytes()) - .dir(dir.path()) - .stdout_capture() - .stderr_capture() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - let expected_check_output = "\ - a: OK\n\ - b: OK\n\ - c/d: OK\n"; - assert_eq!(expected_check_output, stdout); - assert_eq!("", stderr); - - // Now pass the same checkfile twice on the command line just for fun. 
- let checkfile_path = dir.path().join("checkfile"); - fs::write(&checkfile_path, &expected_checkfile).unwrap(); - let output = cmd!(b3sum_exe(), "--check", &checkfile_path, &checkfile_path) - .dir(dir.path()) - .stdout_capture() - .stderr_capture() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - let mut double_check_output = String::new(); - double_check_output.push_str(&expected_check_output); - double_check_output.push_str(&expected_check_output); - assert_eq!(double_check_output, stdout); - assert_eq!("", stderr); - - // Corrupt one of the files and check again. - fs::write(dir.path().join("b"), b"CORRUPTION").unwrap(); - let output = cmd!(b3sum_exe(), "--check", &checkfile_path) - .dir(dir.path()) - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - let expected_check_failure = "\ - a: OK\n\ - b: FAILED\n\ - c/d: OK\n"; - assert!(!output.status.success()); - assert_eq!(expected_check_failure, stdout); - assert_eq!("", stderr); - - // Delete one of the files and check again. - fs::remove_file(dir.path().join("b")).unwrap(); - let open_file_error = fs::File::open(dir.path().join("b")).unwrap_err(); - let output = cmd!(b3sum_exe(), "--check", &checkfile_path) - .dir(dir.path()) - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - let expected_check_failure = format!( - "a: OK\n\ - b: FAILED ({})\n\ - c/d: OK\n", - open_file_error, - ); - assert!(!output.status.success()); - assert_eq!(expected_check_failure, stdout); - assert_eq!("", stderr); - - // Confirm that --quiet suppresses the OKs but not the FAILEDs. 
- let output = cmd!(b3sum_exe(), "--check", "--quiet", &checkfile_path) - .dir(dir.path()) - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - let expected_check_failure = format!("b: FAILED ({})\n", open_file_error); - assert!(!output.status.success()); - assert_eq!(expected_check_failure, stdout); - assert_eq!("", stderr); -} - -#[test] -fn test_check_invalid_characters() { - // Check that a null character in the path fails. - let output = cmd!(b3sum_exe(), "--check") - .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \0") - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - assert!(!output.status.success()); - assert_eq!("", stdout); - assert_eq!("b3sum: Null character in path\n", stderr); - - // Check that a Unicode replacement character in the path fails. - let output = cmd!(b3sum_exe(), "--check") - .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 �") - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - assert!(!output.status.success()); - assert_eq!("", stdout); - assert_eq!("b3sum: Unicode replacement character in path\n", stderr); - - // Check that an invalid escape sequence in the path fails. 
- let output = cmd!(b3sum_exe(), "--check") - .stdin_bytes("\\0000000000000000000000000000000000000000000000000000000000000000 \\a") - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - assert!(!output.status.success()); - assert_eq!("", stdout); - assert_eq!("b3sum: Invalid backslash escape\n", stderr); - - // Windows also forbids literal backslashes. Check for that if and only if - // we're on Windows. - if cfg!(windows) { - let output = cmd!(b3sum_exe(), "--check") - .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \\") - .stdout_capture() - .stderr_capture() - .unchecked() - .run() - .unwrap(); - let stdout = std::str::from_utf8(&output.stdout).unwrap(); - let stderr = std::str::from_utf8(&output.stderr).unwrap(); - assert!(!output.status.success()); - assert_eq!("", stdout); - assert_eq!("b3sum: Backslash in path\n", stderr); - } -} - -#[test] -fn test_globbing() { - // On Unix, globbing is provided by the shell. On Windows, globbing is - // provided by us, using the `wild` crate. - let dir = tempfile::tempdir().unwrap(); - let file1 = dir.path().join("file1"); - fs::write(&file1, b"foo").unwrap(); - let file2 = dir.path().join("file2"); - fs::write(&file2, b"bar").unwrap(); - - let foo_hash = blake3::hash(b"foo"); - let bar_hash = blake3::hash(b"bar"); - // NOTE: This assumes that the glob will be expanded in alphabetical order, - // to "file1 file2" rather than "file2 file1". So far, this seems to - // be true (guaranteed?) of Unix shell behavior, and true in practice - // with the `wild` crate on Windows. It's possible that this could - // start failing in the future, though, or on some unknown platform. - // If that ever happens, we'll need to relax this test somehow, - // probably by just testing for both possible outputs. 
I'm not - // handling that case in advance, though, because I'd prefer to hear - // about it if it comes up. - let expected = format!("{} file1\n{} file2", foo_hash.to_hex(), bar_hash.to_hex()); - - let star_command = format!("{} *", b3sum_exe().to_str().unwrap()); - let (exe, c_flag) = if cfg!(windows) { - ("cmd.exe", "/C") - } else { - ("/bin/sh", "-c") - }; - let output = cmd!(exe, c_flag, star_command) - .dir(dir.path()) - .read() - .unwrap(); - assert_eq!(expected, output); -} diff --git a/thirdparty/BLAKE3/b3sum/what_does_check_do.md b/thirdparty/BLAKE3/b3sum/what_does_check_do.md deleted file mode 100644 index 3a44a0010..000000000 --- a/thirdparty/BLAKE3/b3sum/what_does_check_do.md +++ /dev/null @@ -1,174 +0,0 @@ -# How does `b3sum --check` behave exactly?<br>or: Are filepaths...text? - -Most of the time, `b3sum --check` is a drop-in replacement for `md5sum --check` -and other Coreutils hashing tools. It consumes a checkfile (the output of a -regular `b3sum` command), re-hashes all the files listed there, and returns -success if all of those hashes are still correct. What makes this more -complicated than it might seem, is that representing filepaths as text means we -need to consider many possible edge cases of unrepresentable filepaths. This -document describes all of these edge cases in detail. 
- -## The simple case - -Here's the result of running `b3sum a b c/d` in a directory that contains -those three files: - -```bash -$ echo hi > a -$ echo lo > b -$ mkdir c -$ echo stuff > c/d -$ b3sum a b c/d -0b8b60248fad7ac6dfac221b7e01a8b91c772421a15b387dd1fb2d6a94aee438 a -6ae4a57bbba24f79c461d30bcb4db973b9427d9207877e34d2d74528daa84115 b -2d477356c962e54784f1c5dc5297718d92087006f6ee96b08aeaf7f3cd252377 c/d -``` - -If we pipe that output into `b3sum --check`, it will exit with status zero -(success) and print: - -```bash -$ b3sum a b c/d | b3sum --check -a: OK -b: OK -c/d: OK -``` - -If we delete `b` and change the contents of `c/d`, and then use the same -checkfile as above, `b3sum --check` will exit with a non-zero status (failure) -and print: - -```bash -$ b3sum a b c/d > checkfile -$ rm b -$ echo more stuff >> c/d -$ b3sum --check checkfile -a: OK -b: FAILED (No such file or directory (os error 2)) -c/d: FAILED -``` - -In these typical cases, `b3sum` and `md5sum` have identical output for success -and very similar output for failure. - -## Escaping newlines and backslashes - -Since the checkfile format (the regular output format of `b3sum`) is -newline-separated text, we need to worry about what happens when a filepath -contains a newline, or worse. Suppose we create a file named `x[newline]x` -(3 characters). One way to create such a file is with a Python one-liner like -this: - -```python ->>> open("x\nx", "w") -``` - -Here's what happens when we hash that file with `b3sum`: - -```bash -$ b3sum x* -\af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 x\nx -``` - -Notice two things. First, `b3sum` puts a single `\` character at the front of -the line. This indicates that the filepath contains escape sequences that -`b3sum --check` will need to unescape. Then, `b3sum` replaces the newline -character in the filepath with the two-character escape sequence `\n`. 
-Similarly, if the filepath contained a backslash, `b3sum` would escape it as -`\\` in the output. So far, all of this behavior is still identical to -`md5sum`. - -## Invalid Unicode - -This is where `b3sum` and `md5sum` diverge. Apart from the newline and backslash -escapes described above, `md5sum` copies all other filepath bytes verbatim to -its output. That means its output encoding is "ASCII plus whatever bytes we got -from the command line". This creates two problems: - -1. Printing something that isn't UTF-8 is kind of gross. -2. Windows support. - -What's the deal with Windows? To start with, there's a fundamental difference -in how Unix and Windows represent filepaths. Unix filepaths are "usually UTF-8" -and Windows filepaths are "usually UTF-16". That means that a file named `abc` -is typically represented as the bytes `[97, 98, 99]` on Unix and as the bytes -`[97, 0, 98, 0, 99, 0]` on Windows. The `md5sum` approach won't work if we plan -on creating a checkfile on Unix and checking it on Windows, or vice versa. - -A more portable approach is to convert platform-specific bytes into some -consistent Unicode encoding. (In practice this is going to be UTF-8, but in -theory it could be anything.) Then when `--check` needs to open a file, we -convert the Unicode representation back into platform-specific bytes. This -makes important common cases like `abc`, and in fact even `abc[newline]def`, -work as expected. Great! - -But...what did we mean above when we said *usually* UTF-8 and *usually* UTF-16? -It turns out that not every possible sequence of bytes is valid UTF-8, and not -every possible sequence of 16-bit wide chars is valid UTF-16. For example, the -byte 0xFF (255) can never appear in any UTF-8 string. 
If we ask Python to -decode it, it yells at us: - -```python ->>> b"\xFF".decode("UTF-8") -UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte -``` - -However, tragically, we *can* create a file with that byte in its name (on -Linux at least, though not usually on macOS): - -```python ->>> open(b"y\xFFy", "w") -``` - -So some filepaths aren't representable in Unicode at all. Our plan to "convert -platform-specific bytes into some consistent Unicode encoding" isn't going to -work for everything. What does `b3sum` do with the file above? - -```bash -$ b3sum y* -af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 y�y -``` - -That � in there is a "Unicode replacement character". When we run into -filepaths that we can't represent in Unicode, we replace the unrepresentable -parts with these characters. On the checking side, to avoid any possible -confusion between two different invalid filepaths, we automatically fail if we -see a replacement character. Together with a few more details covered in the -next section, this gives us an important set of properties: - -1. Any file can be hashed locally. -2. Any file with a valid Unicode name not containing the � character can be - checked. -3. Checking ambiguous or unrepresentable filepaths always fails. -4. Checkfiles are always valid UTF-8. -5. Checkfiles are portable between Unix and Windows. - -## Formal Rules - -1. When hashing, filepaths are represented in a platform-specific encoding, - which can accommodate any filepath on the current platform. In Rust, this is - `OsStr`/`OsString`. -2. In output, filepaths are first converted to UTF-8. Any non-Unicode segments - are replaced with Unicode replacement characters (U+FFFD). In Rust, this is - `OsStr::to_string_lossy`. -3. Then, if a filepath contains any backslashes (U+005C) or newlines (U+000A), - these characters are escaped as `\\` and `\n` respectively. -4. 
Finally, any output line containing an escape sequence is prefixed with a - single backslash. -5. When checking, each line is parsed as UTF-8, separated by a newline - (U+000A). Invalid UTF-8 is an error. -6. Then, if a line begins with a backslash, the filepath component is - unescaped. Any escape sequence other than `\\` or `\n` is an error. If a - line does not begin with a backslash, unescaping is not performed, and any - backslashes in the filepath component are interpreted literally. (`b3sum` - output never contains unescaped backslashes, but they can occur in - checkfiles assembled by hand.) -7. Finally, if a filepath contains a Unicode replacement character (U+FFFD) or - a null character (U+0000), it is an error. - - **Additionally, on Windows only:** - -8. In output, all backslashes (U+005C) are replaced with forward slashes - (U+002F). -9. When checking, after unescaping, if a filepath contains a backslash, it is - an error. diff --git a/thirdparty/BLAKE3/benches/bench.rs b/thirdparty/BLAKE3/benches/bench.rs deleted file mode 100644 index ba5a4041f..000000000 --- a/thirdparty/BLAKE3/benches/bench.rs +++ /dev/null @@ -1,520 +0,0 @@ -#![feature(test)] - -extern crate test; - -use arrayref::array_ref; -use arrayvec::ArrayVec; -use blake3::platform::{Platform, MAX_SIMD_DEGREE}; -use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; -use rand::prelude::*; -use test::Bencher; - -const KIB: usize = 1024; - -// This struct randomizes two things: -// 1. The actual bytes of input. -// 2. The page offset the input starts at. 
-pub struct RandomInput { - buf: Vec<u8>, - len: usize, - offsets: Vec<usize>, - offset_index: usize, -} - -impl RandomInput { - pub fn new(b: &mut Bencher, len: usize) -> Self { - b.bytes += len as u64; - let page_size: usize = page_size::get(); - let mut buf = vec![0u8; len + page_size]; - let mut rng = rand::thread_rng(); - rng.fill_bytes(&mut buf); - let mut offsets: Vec<usize> = (0..page_size).collect(); - offsets.shuffle(&mut rng); - Self { - buf, - len, - offsets, - offset_index: 0, - } - } - - pub fn get(&mut self) -> &[u8] { - let offset = self.offsets[self.offset_index]; - self.offset_index += 1; - if self.offset_index >= self.offsets.len() { - self.offset_index = 0; - } - &self.buf[offset..][..self.len] - } -} - -fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { - let mut state = [1u32; 8]; - let mut r = RandomInput::new(b, 64); - let input = array_ref!(r.get(), 0, 64); - b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); -} - -#[bench] -fn bench_single_compression_portable(b: &mut Bencher) { - bench_single_compression_fn(b, Platform::portable()); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_single_compression_sse2(b: &mut Bencher) { - if let Some(platform) = Platform::sse2() { - bench_single_compression_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_single_compression_sse41(b: &mut Bencher) { - if let Some(platform) = Platform::sse41() { - bench_single_compression_fn(b, platform); - } -} - -#[bench] -#[cfg(blake3_avx512_ffi)] -fn bench_single_compression_avx512(b: &mut Bencher) { - if let Some(platform) = Platform::avx512() { - bench_single_compression_fn(b, platform); - } -} - -fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { - let degree = platform.simd_degree(); - let mut inputs = Vec::new(); - for _ in 0..degree { - inputs.push(RandomInput::new(b, CHUNK_LEN)); - } - b.iter(|| { - let input_arrays: 
ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - platform.hash_many( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::Yes, - 0, - 0, - 0, - &mut out, - ); - }); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_sse2(b: &mut Bencher) { - if let Some(platform) = Platform::sse2() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_sse41(b: &mut Bencher) { - if let Some(platform) = Platform::sse41() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_avx2(b: &mut Bencher) { - if let Some(platform) = Platform::avx2() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(blake3_avx512_ffi)] -fn bench_many_chunks_avx512(b: &mut Bencher) { - if let Some(platform) = Platform::avx512() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(feature = "neon")] -fn bench_many_chunks_neon(b: &mut Bencher) { - if let Some(platform) = Platform::neon() { - bench_many_chunks_fn(b, platform); - } -} - -// TODO: When we get const generics we can unify this with the chunks code. 
-fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { - let degree = platform.simd_degree(); - let mut inputs = Vec::new(); - for _ in 0..degree { - inputs.push(RandomInput::new(b, BLOCK_LEN)); - } - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - platform.hash_many( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::No, - 0, - 0, - 0, - &mut out, - ); - }); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_sse2(b: &mut Bencher) { - if let Some(platform) = Platform::sse2() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_sse41(b: &mut Bencher) { - if let Some(platform) = Platform::sse41() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_avx2(b: &mut Bencher) { - if let Some(platform) = Platform::avx2() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(blake3_avx512_ffi)] -fn bench_many_parents_avx512(b: &mut Bencher) { - if let Some(platform) = Platform::avx512() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(feature = "neon")] -fn bench_many_parents_neon(b: &mut Bencher) { - if let Some(platform) = Platform::neon() { - bench_many_parents_fn(b, platform); - } -} - -fn bench_atonce(b: &mut Bencher, len: usize) { - let mut input = RandomInput::new(b, len); - b.iter(|| blake3::hash(input.get())); -} - -#[bench] -fn bench_atonce_0001_block(b: &mut Bencher) { - bench_atonce(b, BLOCK_LEN); -} - -#[bench] -fn bench_atonce_0001_kib(b: &mut Bencher) { - bench_atonce(b, 1 * KIB); -} - -#[bench] -fn bench_atonce_0002_kib(b: &mut Bencher) { - bench_atonce(b, 2 * KIB); -} - -#[bench] -fn bench_atonce_0004_kib(b: &mut 
Bencher) { - bench_atonce(b, 4 * KIB); -} - -#[bench] -fn bench_atonce_0008_kib(b: &mut Bencher) { - bench_atonce(b, 8 * KIB); -} - -#[bench] -fn bench_atonce_0016_kib(b: &mut Bencher) { - bench_atonce(b, 16 * KIB); -} - -#[bench] -fn bench_atonce_0032_kib(b: &mut Bencher) { - bench_atonce(b, 32 * KIB); -} - -#[bench] -fn bench_atonce_0064_kib(b: &mut Bencher) { - bench_atonce(b, 64 * KIB); -} - -#[bench] -fn bench_atonce_0128_kib(b: &mut Bencher) { - bench_atonce(b, 128 * KIB); -} - -#[bench] -fn bench_atonce_0256_kib(b: &mut Bencher) { - bench_atonce(b, 256 * KIB); -} - -#[bench] -fn bench_atonce_0512_kib(b: &mut Bencher) { - bench_atonce(b, 512 * KIB); -} - -#[bench] -fn bench_atonce_1024_kib(b: &mut Bencher) { - bench_atonce(b, 1024 * KIB); -} - -fn bench_incremental(b: &mut Bencher, len: usize) { - let mut input = RandomInput::new(b, len); - b.iter(|| blake3::Hasher::new().update(input.get()).finalize()); -} - -#[bench] -fn bench_incremental_0001_block(b: &mut Bencher) { - bench_incremental(b, BLOCK_LEN); -} - -#[bench] -fn bench_incremental_0001_kib(b: &mut Bencher) { - bench_incremental(b, 1 * KIB); -} - -#[bench] -fn bench_incremental_0002_kib(b: &mut Bencher) { - bench_incremental(b, 2 * KIB); -} - -#[bench] -fn bench_incremental_0004_kib(b: &mut Bencher) { - bench_incremental(b, 4 * KIB); -} - -#[bench] -fn bench_incremental_0008_kib(b: &mut Bencher) { - bench_incremental(b, 8 * KIB); -} - -#[bench] -fn bench_incremental_0016_kib(b: &mut Bencher) { - bench_incremental(b, 16 * KIB); -} - -#[bench] -fn bench_incremental_0032_kib(b: &mut Bencher) { - bench_incremental(b, 32 * KIB); -} - -#[bench] -fn bench_incremental_0064_kib(b: &mut Bencher) { - bench_incremental(b, 64 * KIB); -} - -#[bench] -fn bench_incremental_0128_kib(b: &mut Bencher) { - bench_incremental(b, 128 * KIB); -} - -#[bench] -fn bench_incremental_0256_kib(b: &mut Bencher) { - bench_incremental(b, 256 * KIB); -} - -#[bench] -fn bench_incremental_0512_kib(b: &mut Bencher) { - 
bench_incremental(b, 512 * KIB); -} - -#[bench] -fn bench_incremental_1024_kib(b: &mut Bencher) { - bench_incremental(b, 1024 * KIB); -} - -fn bench_reference(b: &mut Bencher, len: usize) { - let mut input = RandomInput::new(b, len); - b.iter(|| { - let mut hasher = reference_impl::Hasher::new(); - hasher.update(input.get()); - let mut out = [0; 32]; - hasher.finalize(&mut out); - out - }); -} - -#[bench] -fn bench_reference_0001_block(b: &mut Bencher) { - bench_reference(b, BLOCK_LEN); -} - -#[bench] -fn bench_reference_0001_kib(b: &mut Bencher) { - bench_reference(b, 1 * KIB); -} - -#[bench] -fn bench_reference_0002_kib(b: &mut Bencher) { - bench_reference(b, 2 * KIB); -} - -#[bench] -fn bench_reference_0004_kib(b: &mut Bencher) { - bench_reference(b, 4 * KIB); -} - -#[bench] -fn bench_reference_0008_kib(b: &mut Bencher) { - bench_reference(b, 8 * KIB); -} - -#[bench] -fn bench_reference_0016_kib(b: &mut Bencher) { - bench_reference(b, 16 * KIB); -} - -#[bench] -fn bench_reference_0032_kib(b: &mut Bencher) { - bench_reference(b, 32 * KIB); -} - -#[bench] -fn bench_reference_0064_kib(b: &mut Bencher) { - bench_reference(b, 64 * KIB); -} - -#[bench] -fn bench_reference_0128_kib(b: &mut Bencher) { - bench_reference(b, 128 * KIB); -} - -#[bench] -fn bench_reference_0256_kib(b: &mut Bencher) { - bench_reference(b, 256 * KIB); -} - -#[bench] -fn bench_reference_0512_kib(b: &mut Bencher) { - bench_reference(b, 512 * KIB); -} - -#[bench] -fn bench_reference_1024_kib(b: &mut Bencher) { - bench_reference(b, 1024 * KIB); -} - -#[cfg(feature = "rayon")] -fn bench_rayon(b: &mut Bencher, len: usize) { - let mut input = RandomInput::new(b, len); - b.iter(|| { - blake3::Hasher::new() - .update_with_join::<blake3::join::RayonJoin>(input.get()) - .finalize() - }); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0001_block(b: &mut Bencher) { - bench_rayon(b, BLOCK_LEN); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0001_kib(b: &mut Bencher) { - bench_rayon(b, 
1 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0002_kib(b: &mut Bencher) { - bench_rayon(b, 2 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0004_kib(b: &mut Bencher) { - bench_rayon(b, 4 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0008_kib(b: &mut Bencher) { - bench_rayon(b, 8 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0016_kib(b: &mut Bencher) { - bench_rayon(b, 16 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0032_kib(b: &mut Bencher) { - bench_rayon(b, 32 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0064_kib(b: &mut Bencher) { - bench_rayon(b, 64 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0128_kib(b: &mut Bencher) { - bench_rayon(b, 128 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0256_kib(b: &mut Bencher) { - bench_rayon(b, 256 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_0512_kib(b: &mut Bencher) { - bench_rayon(b, 512 * KIB); -} - -#[bench] -#[cfg(feature = "rayon")] -fn bench_rayon_1024_kib(b: &mut Bencher) { - bench_rayon(b, 1024 * KIB); -} - -// This checks that update() splits up its input in increasing powers of 2, so -// that it can recover a high degree of parallelism when the number of bytes -// hashed so far is uneven. The performance of this benchmark should be -// reasonably close to bench_incremental_0064_kib, within 80% or so. When we -// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), -// performance was less than half. 
-#[bench] -fn bench_two_updates(b: &mut Bencher) { - let len = 65536; - let mut input = RandomInput::new(b, len); - b.iter(|| { - let mut hasher = blake3::Hasher::new(); - let input = input.get(); - hasher.update(&input[..1]); - hasher.update(&input[1..]); - hasher.finalize() - }); -} diff --git a/thirdparty/BLAKE3/build.rs b/thirdparty/BLAKE3/build.rs deleted file mode 100644 index ea657d8db..000000000 --- a/thirdparty/BLAKE3/build.rs +++ /dev/null @@ -1,260 +0,0 @@ -use std::env; - -fn defined(var: &str) -> bool { - println!("cargo:rerun-if-env-changed={}", var); - env::var_os(var).is_some() -} - -fn is_pure() -> bool { - defined("CARGO_FEATURE_PURE") -} - -fn should_prefer_intrinsics() -> bool { - defined("CARGO_FEATURE_PREFER_INTRINSICS") -} - -fn is_neon() -> bool { - defined("CARGO_FEATURE_NEON") -} - -fn is_ci() -> bool { - defined("BLAKE3_CI") -} - -fn warn(warning: &str) { - assert!(!warning.contains("\n")); - println!("cargo:warning={}", warning); - if is_ci() { - println!("cargo:warning=Warnings in CI are treated as errors. Build failed."); - std::process::exit(1); - } -} - -fn target_components() -> Vec<String> { - let target = env::var("TARGET").unwrap(); - target.split("-").map(|s| s.to_string()).collect() -} - -fn is_x86_64() -> bool { - target_components()[0] == "x86_64" -} - -fn is_x86_32() -> bool { - let arch = &target_components()[0]; - arch == "i386" || arch == "i586" || arch == "i686" -} - -fn is_armv7() -> bool { - target_components()[0] == "armv7" -} - -// Windows targets may be using the MSVC toolchain or the GNU toolchain. The -// right compiler flags to use depend on the toolchain. (And we don't want to -// use flag_if_supported, because we don't want features to be silently -// disabled by old compilers.) -fn is_windows_msvc() -> bool { - // Some targets are only two components long, so check in steps. 
- target_components()[1] == "pc" - && target_components()[2] == "windows" - && target_components()[3] == "msvc" -} - -fn is_windows_gnu() -> bool { - // Some targets are only two components long, so check in steps. - target_components()[1] == "pc" - && target_components()[2] == "windows" - && target_components()[3] == "gnu" -} - -fn new_build() -> cc::Build { - let mut build = cc::Build::new(); - if !is_windows_msvc() { - build.flag("-std=c11"); - } - build -} - -#[derive(PartialEq)] -enum CCompilerSupport { - NoCompiler, - NoAVX512, - YesAVX512, -} -use CCompilerSupport::*; - -fn c_compiler_support() -> CCompilerSupport { - let build = new_build(); - let flags_checked; - let support_result: Result<bool, _> = if is_windows_msvc() { - flags_checked = "/arch:AVX512"; - build.is_flag_supported("/arch:AVX512") - } else { - // Check for both of the flags we use. If -mavx512f works, then -mavx512vl - // will probably always work too, but we might as well be thorough. - flags_checked = "-mavx512f and -mavx512vl"; - match build.is_flag_supported("-mavx512f") { - Ok(true) => build.is_flag_supported("-mavx512vl"), - false_or_error => false_or_error, - } - }; - match support_result { - Ok(true) => YesAVX512, - Ok(false) => { - warn(&format!( - "The C compiler {:?} does not support {}.", - build.get_compiler().path(), - flags_checked, - )); - NoAVX512 - } - Err(e) => { - println!("{:?}", e); - warn(&format!( - "No C compiler {:?} detected.", - build.get_compiler().path() - )); - NoCompiler - } - } -} - -fn build_sse2_sse41_avx2_rust_intrinsics() { - // No C code to compile here. Set the cfg flags that enable the Rust SSE2, - // SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile - // them. - println!("cargo:rustc-cfg=blake3_sse2_rust"); - println!("cargo:rustc-cfg=blake3_sse41_rust"); - println!("cargo:rustc-cfg=blake3_avx2_rust"); -} - -fn build_sse2_sse41_avx2_assembly() { - // Build the assembly implementations for SSE4.1 and AVX2. 
This is - // preferred, but it only supports x86_64. - assert!(is_x86_64()); - println!("cargo:rustc-cfg=blake3_sse2_ffi"); - println!("cargo:rustc-cfg=blake3_sse41_ffi"); - println!("cargo:rustc-cfg=blake3_avx2_ffi"); - let mut build = new_build(); - if is_windows_msvc() { - build.file("c/blake3_sse2_x86-64_windows_msvc.asm"); - build.file("c/blake3_sse41_x86-64_windows_msvc.asm"); - build.file("c/blake3_avx2_x86-64_windows_msvc.asm"); - } else if is_windows_gnu() { - build.file("c/blake3_sse2_x86-64_windows_gnu.S"); - build.file("c/blake3_sse41_x86-64_windows_gnu.S"); - build.file("c/blake3_avx2_x86-64_windows_gnu.S"); - } else { - // All non-Windows implementations are assumed to support - // Linux-style assembly. These files do contain a small - // explicit workaround for macOS also. - build.file("c/blake3_sse2_x86-64_unix.S"); - build.file("c/blake3_sse41_x86-64_unix.S"); - build.file("c/blake3_avx2_x86-64_unix.S"); - } - build.compile("blake3_sse2_sse41_avx2_assembly"); -} - -fn build_avx512_c_intrinsics() { - // This is required on 32-bit x86 targets, since the assembly - // implementation doesn't support those. - println!("cargo:rustc-cfg=blake3_avx512_ffi"); - let mut build = new_build(); - build.file("c/blake3_avx512.c"); - if is_windows_msvc() { - build.flag("/arch:AVX512"); - } else { - build.flag("-mavx512f"); - build.flag("-mavx512vl"); - } - if is_windows_gnu() { - // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782. - build.flag("-fno-asynchronous-unwind-tables"); - } - build.compile("blake3_avx512_intrinsics"); -} - -fn build_avx512_assembly() { - // Build the assembly implementation for AVX-512. This is preferred, but it - // only supports x86_64.
- assert!(is_x86_64()); - println!("cargo:rustc-cfg=blake3_avx512_ffi"); - let mut build = new_build(); - if is_windows_msvc() { - build.file("c/blake3_avx512_x86-64_windows_msvc.asm"); - } else { - if is_windows_gnu() { - build.file("c/blake3_avx512_x86-64_windows_gnu.S"); - } else { - // All non-Windows implementations are assumed to support Linux-style - // assembly. These files do contain a small explicit workaround for - // macOS also. - build.file("c/blake3_avx512_x86-64_unix.S"); - } - // Older versions of Clang require these flags, even for assembly. See - // https://github.com/BLAKE3-team/BLAKE3/issues/79. - build.flag("-mavx512f"); - build.flag("-mavx512vl"); - } - build.compile("blake3_avx512_assembly"); -} - -fn build_neon_c_intrinsics() { - let mut build = new_build(); - // Note that blake3_neon.c normally depends on the blake3_portable.c - // for the single-instance compression function, but we expose - // portable.rs over FFI instead. See ffi_neon.rs. - build.file("c/blake3_neon.c"); - // ARMv7 platforms that support NEON generally need the following - // flags. AArch64 supports NEON by default and does not support -mfpu. - if is_armv7() { - build.flag("-mfpu=neon-vfpv4"); - build.flag("-mfloat-abi=hard"); - } - build.compile("blake3_neon"); -} - -fn main() -> Result<(), Box<dyn std::error::Error>> { - if is_pure() && is_neon() { - panic!("It doesn't make sense to enable both \"pure\" and \"neon\"."); - } - - if is_x86_64() || is_x86_32() { - let support = c_compiler_support(); - if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler { - build_sse2_sse41_avx2_rust_intrinsics(); - } else { - // We assume that all C compilers can assemble SSE4.1 and AVX2. We - // don't explicitly check for support. - build_sse2_sse41_avx2_assembly(); - } - - if is_pure() || support == NoCompiler || support == NoAVX512 { - // The binary will not include any AVX-512 code.
- } else if is_x86_32() || should_prefer_intrinsics() { - build_avx512_c_intrinsics(); - } else { - build_avx512_assembly(); - } - } - - if is_neon() { - build_neon_c_intrinsics(); - } - - // The `cc` crate doesn't automatically emit rerun-if directives for the - // environment variables it supports, in particular for $CC. We expect to - // do a lot of benchmarking across different compilers, so we explicitly - // add the variables that we're likely to need. - println!("cargo:rerun-if-env-changed=CC"); - println!("cargo:rerun-if-env-changed=CFLAGS"); - - // Ditto for source files, though these shouldn't change as often. - for file in std::fs::read_dir("c")? { - println!( - "cargo:rerun-if-changed={}", - file?.path().to_str().expect("utf-8") - ); - } - - Ok(()) -} diff --git a/thirdparty/BLAKE3/c/.gitignore b/thirdparty/BLAKE3/c/.gitignore deleted file mode 100644 index 0bf608cee..000000000 --- a/thirdparty/BLAKE3/c/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -blake3 -example -*.o diff --git a/thirdparty/BLAKE3/c/Makefile.testing b/thirdparty/BLAKE3/c/Makefile.testing deleted file mode 100644 index 41e6b8285..000000000 --- a/thirdparty/BLAKE3/c/Makefile.testing +++ /dev/null @@ -1,78 +0,0 @@ -# This Makefile is only for testing. C callers should follow the instructions -# in ./README.md to incorporate these C files into their existing build. 
- -NAME=blake3 -CC=gcc -CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden -LDFLAGS=-pie -Wl,-z,relro,-z,now -TARGETS= -ASM_TARGETS= -EXTRAFLAGS=-Wa,--noexecstack - -ifdef BLAKE3_NO_SSE2 -EXTRAFLAGS += -DBLAKE3_NO_SSE2 -else -TARGETS += blake3_sse2.o -ASM_TARGETS += blake3_sse2_x86-64_unix.S -endif - -ifdef BLAKE3_NO_SSE41 -EXTRAFLAGS += -DBLAKE3_NO_SSE41 -else -TARGETS += blake3_sse41.o -ASM_TARGETS += blake3_sse41_x86-64_unix.S -endif - -ifdef BLAKE3_NO_AVX2 -EXTRAFLAGS += -DBLAKE3_NO_AVX2 -else -TARGETS += blake3_avx2.o -ASM_TARGETS += blake3_avx2_x86-64_unix.S -endif - -ifdef BLAKE3_NO_AVX512 -EXTRAFLAGS += -DBLAKE3_NO_AVX512 -else -TARGETS += blake3_avx512.o -ASM_TARGETS += blake3_avx512_x86-64_unix.S -endif - -ifdef BLAKE3_USE_NEON -EXTRAFLAGS += -DBLAKE3_USE_NEON -TARGETS += blake3_neon.o -endif - -all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS) - $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) - -blake3_sse2.o: blake3_sse2.c - $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2 - -blake3_sse41.o: blake3_sse41.c - $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1 - -blake3_avx2.o: blake3_avx2.c - $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2 - -blake3_avx512.o: blake3_avx512.c - $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl - -blake3_neon.o: blake3_neon.c - $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ - -test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined -test: all - ./test.py - -asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS) - $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) - -test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined -test_asm: asm - ./test.py - -example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS) - $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS) - -clean: - rm -f $(NAME) *.o diff --git a/thirdparty/BLAKE3/c/README.md b/thirdparty/BLAKE3/c/README.md deleted 
file mode 100644 index 5e8b4e682..000000000 --- a/thirdparty/BLAKE3/c/README.md +++ /dev/null @@ -1,270 +0,0 @@ -The official C implementation of BLAKE3. - -# Example - -An example program that hashes bytes from standard input and prints the -result: - -```c -#include "blake3.h" -#include <stdio.h> -#include <unistd.h> - -int main() { - // Initialize the hasher. - blake3_hasher hasher; - blake3_hasher_init(&hasher); - - // Read input bytes from stdin. - unsigned char buf[65536]; - ssize_t n; - while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { - blake3_hasher_update(&hasher, buf, n); - } - - // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. - uint8_t output[BLAKE3_OUT_LEN]; - blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); - - // Print the hash as hexadecimal. - for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { - printf("%02x", output[i]); - } - printf("\n"); - return 0; -} -``` - -The code above is included in this directory as `example.c`. If you're -on x86\_64 with a Unix-like OS, you can compile a working binary like -this: - -```bash -gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \ - blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ - blake3_avx512_x86-64_unix.S -``` - -# API - -## The Struct - -```c -typedef struct { - // private fields -} blake3_hasher; -``` - -An incremental BLAKE3 hashing state, which can accept any number of -updates. This implementation doesn't allocate any heap memory, but -`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes -on x86-64. This size can be reduced by restricting the maximum input -length, as described in Section 5.4 of [the BLAKE3 -spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), -but this implementation doesn't currently support that strategy. 
- -## Common API Functions - -```c -void blake3_hasher_init( - blake3_hasher *self); -``` - -Initialize a `blake3_hasher` in the default hashing mode. - ---- - -```c -void blake3_hasher_update( - blake3_hasher *self, - const void *input, - size_t input_len); -``` - -Add input to the hasher. This can be called any number of times. - ---- - -```c -void blake3_hasher_finalize( - const blake3_hasher *self, - uint8_t *out, - size_t out_len); -``` - -Finalize the hasher and emit an output of any length. This doesn't -modify the hasher itself, and it's possible to finalize again after -adding more input. The constant `BLAKE3_OUT_LEN` provides the default -output length, 32 bytes. - -## Less Common API Functions - -```c -void blake3_hasher_init_keyed( - blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); -``` - -Initialize a `blake3_hasher` in the keyed hashing mode. The key must be -exactly 32 bytes. - ---- - -```c -void blake3_hasher_init_derive_key( - blake3_hasher *self, - const char *context); -``` - -Initialize a `blake3_hasher` in the key derivation mode. The context -string is given as an initialization parameter, and afterwards input key -material should be given with `blake3_hasher_update`. The context string -is a null-terminated C string which should be **hardcoded, globally -unique, and application-specific**. The context string should not -include any dynamic input like salts, nonces, or identifiers read from a -database at runtime. A good default format for the context string is -`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com -2019-12-25 16:18:03 session tokens v1"`. - -This function is intended for application code written in C. For -language bindings, see `blake3_hasher_init_derive_key_raw` below. 
- ---- - -```c -void blake3_hasher_init_derive_key_raw( - blake3_hasher *self, - const void *context, - size_t context_len); -``` - -As `blake3_hasher_init_derive_key` above, except that the context string -is given as a pointer to an array of arbitrary bytes with a provided -length. This is intended for writing language bindings, where C string -conversion would add unnecessary overhead and new error cases. Unicode -strings should be encoded as UTF-8. - -Application code in C should prefer `blake3_hasher_init_derive_key`, -which takes the context as a C string. If you need to use arbitrary -bytes as a context string in application code, consider whether you're -violating the requirement that context strings should be hardcoded. - ---- - -```c -void blake3_hasher_finalize_seek( - const blake3_hasher *self, - uint64_t seek, - uint8_t *out, - size_t out_len); -``` - -The same as `blake3_hasher_finalize`, but with an additional `seek` -parameter for the starting byte position in the output stream. To -efficiently stream a large output without allocating memory, call this -function in a loop, incrementing `seek` by the output length each time. - -# Building - -This implementation is just C and assembly files. It doesn't include a -public-facing build system. (The `Makefile` in this directory is only -for testing.) Instead, the intention is that you can include these files -in whatever build system you're already using. This section describes -the commands your build system should execute, or which you can execute -by hand. Note that these steps may change in future versions. - -## x86 - -Dynamic dispatch is enabled by default on x86. The implementation will -query the CPU at runtime to detect SIMD support, and it will use the -widest instruction set available. By default, `blake3_dispatch.c` -expects to be linked with code for five different instruction sets: -portable C, SSE2, SSE4.1, AVX2, and AVX-512. 
- -For each of the x86 SIMD instruction sets, two versions are available, -one in assembly (which is further divided into three flavors: Unix, -Windows MSVC, and Windows GNU) and one using C intrinsics. The assembly -versions are generally preferred: they perform better, they perform more -consistently across different compilers, and they build more quickly. On -the other hand, the assembly versions are x86\_64-only, and you need to -select the right flavor for your target platform. - -Here's an example of building a shared library on x86\_64 Linux using -the assembly implementations: - -```bash -gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ - blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ - blake3_avx512_x86-64_unix.S -``` - -When building the intrinsics-based implementations, you need to build -each implementation separately, with the corresponding instruction set -explicitly enabled in the compiler. Here's the same shared library using -the intrinsics-based implementations: - -```bash -gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o -gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o -gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o -gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o -gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ - blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o -``` - -Note above that building `blake3_avx512.c` requires both `-mavx512f` and -`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512` -flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`. -MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a -corresponding flag. - -If you want to omit SIMD code entirely, you need to explicitly disable -each instruction set.
Here's an example of building a shared library on -x86 with only portable code: - -```bash -gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \ - -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c -``` - -## ARM NEON - -The NEON implementation is not enabled by default on ARM, since not all -ARM targets support it. To enable it, set `BLAKE3_USE_NEON=1`. Here's an -example of building a shared library on ARM Linux with NEON support: - -```bash -gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON blake3.c blake3_dispatch.c \ - blake3_portable.c blake3_neon.c -``` - -Note that on some targets (ARMv7 in particular), extra flags may be -required to activate NEON support in the compiler. If you see an error -like... - -``` -/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed -in call to always_inline ‘vaddq_u32’: target specific option mismatch -``` - -...then you may need to add something like `-mfpu=neon-vfpv4 --mfloat-abi=hard`. - -## Other Platforms - -The portable implementation should work on most other architectures. For -example: - -```bash -gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c -``` - -# Differences from the Rust Implementation - -The single-threaded Rust and C implementations use the same algorithms, -and their performance is the same if you use the assembly -implementations or if you compile the intrinsics-based implementations -with Clang. (Both Clang and rustc are LLVM-based.) - -The C implementation doesn't currently include any multithreading -optimizations. OpenMP support or similar might be added in the future. 
diff --git a/thirdparty/BLAKE3/c/blake3.c b/thirdparty/BLAKE3/c/blake3.c deleted file mode 100644 index 7abf5324e..000000000 --- a/thirdparty/BLAKE3/c/blake3.c +++ /dev/null @@ -1,607 +0,0 @@ -#include <assert.h> -#include <stdbool.h> -#include <string.h> - -#include "blake3.h" -#include "blake3_impl.h" - -const char * blake3_version(void) { - return BLAKE3_VERSION_STRING; -} - -INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; - self->blocks_compressed = 0; - self->flags = flags; -} - -INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], - uint64_t chunk_counter) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = chunk_counter; - self->blocks_compressed = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; -} - -INLINE size_t chunk_state_len(const blake3_chunk_state *self) { - return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + - ((size_t)self->buf_len); -} - -INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, - const uint8_t *input, size_t input_len) { - size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); - if (take > input_len) { - take = input_len; - } - uint8_t *dest = self->buf + ((size_t)self->buf_len); - memcpy(dest, input, take); - self->buf_len += (uint8_t)take; - return take; -} - -INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { - if (self->blocks_compressed == 0) { - return CHUNK_START; - } else { - return 0; - } -} - -typedef struct { - uint32_t input_cv[8]; - uint64_t counter; - uint8_t block[BLAKE3_BLOCK_LEN]; - uint8_t block_len; - uint8_t flags; -} output_t; - -INLINE output_t make_output(const uint32_t input_cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - output_t ret; - memcpy(ret.input_cv, input_cv, 
32); - memcpy(ret.block, block, BLAKE3_BLOCK_LEN); - ret.block_len = block_len; - ret.counter = counter; - ret.flags = flags; - return ret; -} - -// Chaining values within a given chunk (specifically the compress_in_place -// interface) are represented as words. This avoids unnecessary bytes<->words -// conversion overhead in the portable implementation. However, the hash_many -// interface handles both user input and parent node blocks, so it accepts -// bytes. For that reason, chaining values in the CV stack are represented as -// bytes. -INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { - uint32_t cv_words[8]; - memcpy(cv_words, self->input_cv, 32); - blake3_compress_in_place(cv_words, self->block, self->block_len, - self->counter, self->flags); - store_cv_words(cv, cv_words); -} - -INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, - size_t out_len) { - uint64_t output_block_counter = seek / 64; - size_t offset_within_block = seek % 64; - uint8_t wide_buf[64]; - while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, - output_block_counter, self->flags | ROOT, wide_buf); - size_t available_bytes = 64 - offset_within_block; - size_t memcpy_len; - if (out_len > available_bytes) { - memcpy_len = available_bytes; - } else { - memcpy_len = out_len; - } - memcpy(out, wide_buf + offset_within_block, memcpy_len); - out += memcpy_len; - out_len -= memcpy_len; - output_block_counter += 1; - offset_within_block = 0; - } -} - -INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, - size_t input_len) { - if (self->buf_len > 0) { - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; - if (input_len > 0) { - blake3_compress_in_place( - self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - self->buf_len = 0; - memset(self->buf, 
0, BLAKE3_BLOCK_LEN); - } - } - - while (input_len > BLAKE3_BLOCK_LEN) { - blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, - self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - input += BLAKE3_BLOCK_LEN; - input_len -= BLAKE3_BLOCK_LEN; - } - - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; -} - -INLINE output_t chunk_state_output(const blake3_chunk_state *self) { - uint8_t block_flags = - self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; - return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, - block_flags); -} - -INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], - const uint32_t key[8], uint8_t flags) { - return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -INLINE size_t left_len(size_t content_len) { - // Subtract 1 to reserve at least one byte for the right side. content_len - // should always be greater than BLAKE3_CHUNK_LEN. - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; - return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. 
-INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(0 < input_len); - assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); -#endif - - const uint8_t *chunks_array[MAX_SIMD_DEGREE]; - size_t input_position = 0; - size_t chunks_array_len = 0; - while (input_len - input_position >= BLAKE3_CHUNK_LEN) { - chunks_array[chunks_array_len] = &input[input_position]; - input_position += BLAKE3_CHUNK_LEN; - chunks_array_len += 1; - } - - blake3_hash_many(chunks_array, chunks_array_len, - BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, - true, flags, CHUNK_START, CHUNK_END, out); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. - if (input_len > input_position) { - uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, key, flags); - chunk_state.chunk_counter = counter; - chunk_state_update(&chunk_state, &input[input_position], - input_len - input_position); - output_t output = chunk_state_output(&chunk_state); - output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); - return chunks_array_len + 1; - } else { - return chunks_array_len; - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. 
-INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, - size_t num_chaining_values, - const uint32_t key[8], uint8_t flags, - uint8_t *out) { -#if defined(BLAKE3_TESTING) - assert(2 <= num_chaining_values); - assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); -#endif - - const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; - size_t parents_array_len = 0; - while (num_chaining_values - (2 * parents_array_len) >= 2) { - parents_array[parents_array_len] = - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; - parents_array_len += 1; - } - - blake3_hash_many(parents_array, parents_array_len, 1, key, - 0, // Parents always use counter 0. - false, flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out); - - // If there's an odd child left over, it becomes an output. - if (num_chaining_values > 2 * parents_array_len) { - memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], - BLAKE3_OUT_LEN); - return parents_array_len + 1; - } else { - return parents_array_len; - } -} - -// The wide helper function returns (writes out) an array of chaining values -// and returns the length of that array. The number of chaining values returned -// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, -// if the input is shorter than that many chunks. The reason for maintaining a -// wide array of chaining values going back up the tree, is to allow the -// implementation to hash as many parents in parallel as possible. -// -// As a special case when the SIMD degree is 1, this function will still return -// at least 2 outputs. This guarantees that this function doesn't perform the -// root compression. (If it did, it would use the wrong flags, and also we -// wouldn't be able to implement exendable ouput.) Note that this function is -// not used when the whole input is only 1 chunk long; that's a different -// codepath. 
-// -// Why not just have the caller split the input on the first update(), instead -// of implementing this special rule? Because we don't want to limit SIMD or -// multi-threading parallelism for that update(). -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. If this implementation adds multi-threading in the future, - // this gives us the option of multi-threading even the 2-chunk case, which - // can help performance on smaller platforms. - if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { - return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, - out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - size_t left_input_len = left_len(input_len); - size_t right_input_len = input_len - left_input_len; - const uint8_t *right_input = &input[left_input_len]; - uint64_t right_chunk_counter = - chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); - - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t degree = blake3_simd_degree(); - if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { - // The special case: We always use a degree of at least two, to make - // sure there are two outputs. Except, as noted above, at the chunk - // level, where we allow degree=1. (Note that the 1-chunk-input case is - // a different codepath.) 
- degree = 2; - } - uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - - // Recurse! If this implementation adds multi-threading support in the - // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - if (left_n == 1) { - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); - return 2; - } - - // Otherwise, do one layer of parent node compression. - size_t num_chaining_values = left_n + right_n; - return compress_parents_parallel(cv_array, num_chaining_values, key, flags, - out); -} - -// Hash a subtree with compress_subtree_wide(), and then condense the resulting -// list of chaining values down to a single parent node. Don't compress that -// last parent node, however. Instead, return its message bytes (the -// concatenated chaining values of its children). This is necessary when the -// first call to update() supplies a complete subtree, because the topmost -// parent node of that subtree could end up being the root. It's also necessary -// for extended output in the general case. -// -// As with compress_subtree_wide(), this function is not used on inputs of 1 -// chunk or less. That's a different codepath. 
-INLINE void compress_subtree_to_parent_node( - const uint8_t *input, size_t input_len, const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { -#if defined(BLAKE3_TESTING) - assert(input_len > BLAKE3_CHUNK_LEN); -#endif - - uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. - uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - while (num_cvs > 2) { - num_cvs = - compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); - memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); - } - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); -} - -INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->key, key, BLAKE3_KEY_LEN); - chunk_state_init(&self->chunk, key, flags); - self->cv_stack_len = 0; -} - -void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } - -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]) { - uint32_t key_words[8]; - load_key_words(key, key_words); - hasher_init_base(self, key_words, KEYED_HASH); -} - -void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, - size_t context_len) { - blake3_hasher context_hasher; - hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); - blake3_hasher_update(&context_hasher, context, context_len); - uint8_t context_key[BLAKE3_KEY_LEN]; - blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); - uint32_t context_key_words[8]; - load_key_words(context_key, context_key_words); - hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); -} - -void blake3_hasher_init_derive_key(blake3_hasher *self, const 
char *context) { - blake3_hasher_init_derive_key_raw(self, context, strlen(context)); -} - -// As described in hasher_push_cv() below, we do "lazy merging", delaying -// merges until right before the next CV is about to be added. This is -// different from the reference implementation. Another difference is that we -// aren't always merging 1 chunk at a time. Instead, each CV might represent -// any power-of-two number of chunks, as long as the smaller-above-larger stack -// order is maintained. Instead of the "count the trailing 0-bits" algorithm -// described in the spec, we use a "count the total number of 1-bits" variant -// that doesn't require us to retain the subtree size of the CV on top of the -// stack. The principle is the same: each CV that should remain in the stack is -// represented by a 1-bit in the total number of chunks (or bytes) so far. -INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { - size_t post_merge_stack_len = (size_t)popcnt(total_len); - while (self->cv_stack_len > post_merge_stack_len) { - uint8_t *parent_node = - &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; - output_t output = parent_output(parent_node, self->key, self->chunk.flags); - output_chaining_value(&output, parent_node); - self->cv_stack_len -= 1; - } -} - -// In reference_impl.rs, we merge the new CV with existing CVs from the stack -// before pushing it. We can do that because we know more input is coming, so -// we know none of the merges are root. -// -// This setting is different. We want to feed as much input as possible to -// compress_subtree_wide(), without setting aside anything for the chunk_state. -// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once -// as a single subtree, if at all possible. -// -// This leads to two problems: -// 1) This 64 KiB input might be the only call that ever gets made to update. 
-// In this case, the root node of the 64 KiB subtree would be the root node -// of the whole tree, and it would need to be ROOT finalized. We can't -// compress it until we know. -// 2) This 64 KiB input might complete a larger tree, whose root node is -// similarly going to be the the root of the whole tree. For example, maybe -// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the -// node at the root of the 256 KiB subtree until we know how to finalize it. -// -// The second problem is solved with "lazy merging". That is, when we're about -// to add a CV to the stack, we don't merge it with anything first, as the -// reference impl does. Instead we do merges using the *previous* CV that was -// added, which is sitting on top of the stack, and we put the new CV -// (unmerged) on top of the stack afterwards. This guarantees that we never -// merge the root node until finalize(). -// -// Solving the first problem requires an additional tool, -// compress_subtree_to_parent_node(). That function always returns the top -// *two* chaining values of the subtree it's compressing. We then do lazy -// merging with each of them separately, so that the second CV will always -// remain unmerged. (That also helps us support extendable output when we're -// hashing an input all-at-once.) -INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], - uint64_t chunk_counter) { - hasher_merge_cv_stack(self, chunk_counter); - memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, - BLAKE3_OUT_LEN); - self->cv_stack_len += 1; -} - -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. 
This comes up in practice with things like: - // std::vector<uint8_t> v; - // blake3_hasher_update(&hasher, v.data(), v.size()); - if (input_len == 0) { - return; - } - - const uint8_t *input_bytes = (const uint8_t *)input; - - // If we have some partial chunk bytes in the internal chunk_state, we need - // to finish that chunk first. - if (chunk_state_len(&self->chunk) > 0) { - size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); - if (take > input_len) { - take = input_len; - } - chunk_state_update(&self->chunk, input_bytes, take); - input_bytes += take; - input_len -= take; - // If we've filled the current chunk and there's more coming, finalize this - // chunk and proceed. In this case we know it's not the root. - if (input_len > 0) { - output_t output = chunk_state_output(&self->chunk); - uint8_t chunk_cv[32]; - output_chaining_value(&output, chunk_cv); - hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); - chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); - } else { - return; - } - } - - // Now the chunk_state is clear, and we have more input. If there's more than - // a single chunk (so, definitely not the root chunk), hash the largest whole - // subtree we can, with the full benefits of SIMD (and maybe in the future, - // multi-threading) parallelism. Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees along - // the right edge can be incomplete, and we don't know where the right edge - // is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until this - // point (if total is not 0). If the current incomplete subtree is only - // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have - // to complete the current subtree first. - // Because we might need to break up the input to form powers of 2, or to - // evenly divide what we already have, this part runs in a loop. 
- while (input_len > BLAKE3_CHUNK_LEN) { - size_t subtree_len = round_down_to_power_of_2(input_len); - uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; - // Shrink the subtree_len until it evenly divides the count so far. We know - // that subtree_len itself is a power of 2, so we can use a bitmasking - // trick instead of an actual remainder operation. (Note that if the caller - // consistently passes power-of-2 inputs of the same size, as is hopefully - // typical, this loop condition will always fail, and subtree_len will - // always be the full length of the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. For - // example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still - // get the right answer in the end, and we might get to use 2-way SIMD - // parallelism. The problem with this optimization, is that it gets us - // stuck always hashing 2 chunks. The total number of chunks will remain - // odd, and we'll never graduate to higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { - subtree_len /= 2; - } - // The shrunken subtree_len might now be 1 chunk long. If so, hash that one - // chunk by itself. Otherwise, compress the subtree into a pair of CVs. 
- uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; - if (subtree_len <= BLAKE3_CHUNK_LEN) { - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, self->key, self->chunk.flags); - chunk_state.chunk_counter = self->chunk.chunk_counter; - chunk_state_update(&chunk_state, input_bytes, subtree_len); - output_t output = chunk_state_output(&chunk_state); - uint8_t cv[BLAKE3_OUT_LEN]; - output_chaining_value(&output, cv); - hasher_push_cv(self, cv, chunk_state.chunk_counter); - } else { - // This is the high-performance happy path, though getting here depends - // on the caller giving us a long enough input. - uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; - compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, - self->chunk.chunk_counter, - self->chunk.flags, cv_pair); - hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); - hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], - self->chunk.chunk_counter + (subtree_chunks / 2)); - } - self->chunk.chunk_counter += subtree_chunks; - input_bytes += subtree_len; - input_len -= subtree_len; - } - - // If there's any remaining input less than a full chunk, add it to the chunk - // state. In that case, also do a final merge loop to make sure the subtree - // stack doesn't contain any unmerged pairs. The remaining input means we - // know these merges are non-root. This merge loop isn't strictly necessary - // here, because hasher_push_chunk_cv already does its own merge loop, but it - // simplifies blake3_hasher_finalize below. 
- if (input_len > 0) { - chunk_state_update(&self->chunk, input_bytes, input_len); - hasher_merge_cv_stack(self, self->chunk.chunk_counter); - } -} - -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len) { - blake3_hasher_finalize_seek(self, 0, out, out_len); -} - -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector<uint8_t> v; - // blake3_hasher_finalize(&hasher, v.data(), v.size()); - if (out_len == 0) { - return; - } - - // If the subtree stack is empty, then the current chunk is the root. - if (self->cv_stack_len == 0) { - output_t output = chunk_state_output(&self->chunk); - output_root_bytes(&output, seek, out, out_len); - return; - } - // If there are any bytes in the chunk state, finalize that chunk and do a - // roll-up merge between that chunk hash and every subtree in the stack. In - // this case, the extra merge loop at the end of blake3_hasher_update - // guarantees that none of the subtrees in the stack need to be merged with - // each other first. Otherwise, if there are no bytes in the chunk state, - // then the top of the stack is a chunk hash, and we start the merge from - // that. - output_t output; - size_t cvs_remaining; - if (chunk_state_len(&self->chunk) > 0) { - cvs_remaining = self->cv_stack_len; - output = chunk_state_output(&self->chunk); - } else { - // There are always at least 2 CVs in the stack in this case. 
- cvs_remaining = self->cv_stack_len - 2; - output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, - self->chunk.flags); - } - while (cvs_remaining > 0) { - cvs_remaining -= 1; - uint8_t parent_block[BLAKE3_BLOCK_LEN]; - memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); - output_chaining_value(&output, &parent_block[32]); - output = parent_output(parent_block, self->key, self->chunk.flags); - } - output_root_bytes(&output, seek, out, out_len); -} diff --git a/thirdparty/BLAKE3/c/blake3.h b/thirdparty/BLAKE3/c/blake3.h deleted file mode 100644 index 57ebd5adc..000000000 --- a/thirdparty/BLAKE3/c/blake3.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef BLAKE3_H -#define BLAKE3_H - -#include <stddef.h> -#include <stdint.h> - -#ifdef __cplusplus -extern "C" { -#endif - -#define BLAKE3_VERSION_STRING "0.3.7" -#define BLAKE3_KEY_LEN 32 -#define BLAKE3_OUT_LEN 32 -#define BLAKE3_BLOCK_LEN 64 -#define BLAKE3_CHUNK_LEN 1024 -#define BLAKE3_MAX_DEPTH 54 -#define BLAKE3_MAX_SIMD_DEGREE 16 - -// This struct is a private implementation detail. It has to be here because -// it's part of blake3_hasher below. -typedef struct { - uint32_t cv[8]; - uint64_t chunk_counter; - uint8_t buf[BLAKE3_BLOCK_LEN]; - uint8_t buf_len; - uint8_t blocks_compressed; - uint8_t flags; -} blake3_chunk_state; - -typedef struct { - uint32_t key[8]; - blake3_chunk_state chunk; - uint8_t cv_stack_len; - // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, - // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk - // requires a 4th entry, rather than merging everything down to 1, because we - // don't know whether more input is coming. This is different from how the - // reference implementation does things. 
- uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; -} blake3_hasher; - -const char * blake3_version(void); -void blake3_hasher_init(blake3_hasher *self); -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); -void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, - size_t context_len); -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len); -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len); -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len); - -#ifdef __cplusplus -} -#endif - -#endif /* BLAKE3_H */ diff --git a/thirdparty/BLAKE3/c/blake3_avx2.c b/thirdparty/BLAKE3/c/blake3_avx2.c deleted file mode 100644 index c5a2ce9e2..000000000 --- a/thirdparty/BLAKE3/c/blake3_avx2.c +++ /dev/null @@ -1,325 +0,0 @@ -#include "blake3_impl.h" - -#include <immintrin.h> - -#define DEGREE 8 - -INLINE __m256i loadu(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE void storeu(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. 
-INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m256i rot16(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, - 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m256i rot12(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); -} - -INLINE __m256i rot8(__m256i x) { - return _mm256_shuffle_epi8( - x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, - 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m256i rot7(__m256i x) { - return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); -} - -INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); 
- v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], 
v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m256i vecs[DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high - // is 22/33/66/77. - __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is - // 11/33. - __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[8]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m256i *out_lo, __m256i *out_hi) { - const __m256i mask = 
_mm256_set1_epi32(-(int32_t)increment_counter); - const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i add1 = _mm256_and_si256(mask, add0); - __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); - __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), - _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); - __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] 
= xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(h_vecs); - storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -#if !defined(BLAKE3_NO_SSE41) -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#else -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } -#if !defined(BLAKE3_NO_SSE41) - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); -#else - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -#endif -} diff --git a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S deleted file mode 100644 index 812bb8568..000000000 --- 
a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S +++ /dev/null @@ -1,1815 +0,0 @@ -#if defined(__ELF__) && defined(__linux__) -.section .note.GNU-stack,"",%progbits -#endif - -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - -.intel_syntax noprefix -.global _blake3_hash_many_avx2 -.global blake3_hash_many_avx2 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_avx2: -blake3_hash_many_avx2: - _CET_ENDBR - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 680 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - vmovd xmm0, r9d - vpbroadcastd ymm0, xmm0 - vmovdqa ymmword ptr [rsp+0x280], ymm0 - vpand ymm1, ymm0, ymmword ptr [ADD0+rip] - vpand ymm2, ymm0, ymmword ptr [ADD1+rip] - vmovdqa ymmword ptr [rsp+0x220], ymm2 - vmovd xmm2, r8d - vpbroadcastd ymm2, xmm2 - vpaddd ymm2, ymm2, ymm1 - vmovdqa ymmword ptr [rsp+0x240], ymm2 - vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm1, ymm2 - shr r8, 32 - vmovd xmm3, r8d - vpbroadcastd ymm3, xmm3 - vpsubd ymm3, ymm3, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm3 - shl rdx, 6 - mov qword ptr [rsp+0x2A0], rdx - cmp rsi, 8 - jc 3f -2: - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr 
[rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x2A0] - cmove eax, ebx - mov dword ptr [rsp+0x200], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x20], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x40], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x60], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x80], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0xA0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0xC0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0xE0], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr 
[r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x100], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x120], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x140], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x160], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x180], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x1A0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x1C0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x1E0], ymm11 - vpbroadcastd ymm15, dword ptr [rsp+0x200] - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, 
ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] - vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] - vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] - vpxor ymm15, ymm3, ymm15 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] - vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] - vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] - vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, 
ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, 
ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, 
ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 
- vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr 
[rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 
- vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, 
ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb 
ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 
- vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword 
ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - 
vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor 
ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb 
ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 
- vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr 
[rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, 
ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp+0x220] - vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] - vmovdqa ymmword ptr [rsp+0x240], ymm1 - vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm0, ymm2 - vmovdqa ymm0, ymmword ptr [rsp+0x260] - vpsubd ymm2, ymm0, ymm2 - vmovdqa ymmword ptr [rsp+0x260], ymm2 - add rdi, 64 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - sub rsi, 8 - cmp rsi, 8 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x2A0] - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr [rbp+0x48] - test rsi, 0x4 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovdqa ymm8, ymm0 - vmovdqa ymm9, ymm1 - vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] - vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] - vpunpckldq ymm14, ymm12, ymm13 - vpunpckhdq ymm15, ymm12, ymm13 - vpermq ymm14, ymm14, 0x50 - vpermq ymm15, ymm15, 0x50 - vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - vpblendd ymm14, ymm14, ymm12, 0x44 - vpblendd ymm15, ymm15, ymm12, 0x44 - vmovdqa ymmword ptr [rsp], ymm14 - vmovdqa ymmword ptr [rsp+0x20], ymm15 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword 
ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vmovups ymm2, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm2, ymm3, 136 - vshufps ymm5, ymm2, ymm3, 221 - vmovups ymm2, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm2, ymm3, 136 - vshufps ymm7, ymm2, ymm3, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - vmovups ymm10, ymmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 - vshufps ymm12, ymm10, ymm11, 136 - vshufps ymm13, ymm10, ymm11, 221 - vmovups ymm10, ymmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 - vshufps ymm14, ymm10, ymm11, 136 - vshufps ymm15, ymm10, ymm11, 221 - vpshufd ymm14, ymm14, 0x93 - vpshufd ymm15, ymm15, 0x93 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - vpbroadcastd ymm2, dword ptr [rsp+0x200] - vmovdqa ymm3, ymmword ptr [rsp] - vmovdqa ymm11, ymmword ptr [rsp+0x20] - vpblendd ymm3, ymm3, ymm2, 0x88 - vpblendd ymm11, ymm11, ymm2, 0x88 - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa ymm10, ymm2 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm8, ymm8, ymm12 - vmovdqa ymmword ptr [rsp+0x40], ymm4 - nop - vmovdqa ymmword ptr [rsp+0x60], ymm12 - nop - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor 
ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vmovdqa ymmword ptr [rsp+0x80], ymm5 - vmovdqa ymmword ptr [rsp+0xA0], ymm13 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm8, ymm8, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpshufd ymm10, ymm10, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm8, ymm8, ymm14 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm8, ymm8, ymm15 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - 
vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm8, ymm8, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x93 - vpshufd ymm10, ymm10, 0x93 - dec al - je 9f - vmovdqa ymm4, ymmword ptr [rsp+0x40] - vmovdqa ymm5, ymmword ptr [rsp+0x80] - vshufps ymm12, ymm4, ymm5, 214 - vpshufd ymm13, ymm4, 0x0F - vpshufd ymm4, ymm12, 0x39 - vshufps ymm12, ymm6, ymm7, 250 - vpblendd ymm13, ymm13, ymm12, 0xAA - vpunpcklqdq ymm12, ymm7, ymm5 - vpblendd ymm12, ymm12, ymm6, 0x88 - vpshufd ymm12, ymm12, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymmword ptr [rsp+0x40], ymm13 - vmovdqa ymmword ptr [rsp+0x80], ymm12 - vmovdqa ymm12, ymmword ptr [rsp+0x60] - vmovdqa ymm13, ymmword ptr [rsp+0xA0] - vshufps ymm5, ymm12, ymm13, 214 - vpshufd ymm6, ymm12, 0x0F - vpshufd ymm12, ymm5, 0x39 - vshufps ymm5, ymm14, ymm15, 250 - vpblendd ymm6, ymm6, ymm5, 0xAA - vpunpcklqdq ymm5, ymm15, ymm13 - vpblendd ymm5, ymm5, ymm14, 0x88 - vpshufd ymm5, ymm5, 0x78 - vpunpckhdq ymm13, ymm13, ymm15 - vpunpckldq ymm14, ymm14, ymm13 - vpshufd ymm15, ymm14, 0x1E - vmovdqa ymm13, ymm6 - vmovdqa ymm14, ymm5 - vmovdqa ymm5, ymmword ptr [rsp+0x40] - vmovdqa ymm6, ymmword ptr [rsp+0x80] - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - vpxor ymm8, ymm8, ymm10 - vpxor ymm9, ymm9, ymm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqu xmmword ptr [rbx+0x40], xmm8 - vmovdqu xmmword ptr [rbx+0x50], xmm9 - vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 - vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 - vmovaps xmm8, xmmword ptr [rsp+0x280] - vmovaps xmm0, xmmword ptr [rsp+0x240] - vmovaps 
xmm1, xmmword ptr [rsp+0x250] - vmovaps xmm2, xmmword ptr [rsp+0x260] - vmovaps xmm3, xmmword ptr [rsp+0x270] - vblendvps xmm0, xmm0, xmm1, xmm8 - vblendvps xmm2, xmm2, xmm3, xmm8 - vmovaps xmmword ptr [rsp+0x240], xmm0 - vmovaps xmmword ptr [rsp+0x260], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test rsi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp+0x240] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x244] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] - vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x200] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - 
vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovaps ymm8, ymmword ptr [rsp+0x280] - vmovaps ymm0, ymmword ptr [rsp+0x240] - vmovups ymm1, ymmword ptr [rsp+0x248] - vmovaps ymm2, ymmword ptr [rsp+0x260] - vmovups ymm3, ymmword ptr [rsp+0x268] - vblendvps ymm0, ymm0, ymm1, ymm8 - vblendvps ymm2, ymm2, ymm3, ymm8 - vmovaps ymmword ptr [rsp+0x240], ymm0 - vmovaps ymmword ptr [rsp+0x260], ymm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test rsi, 0x1 - je 4b - vmovdqu xmm0, xmmword 
ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm3, dword ptr [rsp+0x240] - vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 - vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm14, xmmword ptr [ROT16+rip] - vmovdqa xmm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa xmm3, xmm13 - vpinsrd xmm3, xmm3, eax, 3 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - 
jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 -ADD1: - .long 8, 8, 8, 8, 8, 8, 8, 8 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A - diff --git a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S deleted file mode 100644 index bb58d2ae6..000000000 --- a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S +++ /dev/null @@ -1,1817 +0,0 @@ -.intel_syntax noprefix -.global _blake3_hash_many_avx2 
-.global blake3_hash_many_avx2 -.section .text - .p2align 6 -_blake3_hash_many_avx2: -blake3_hash_many_avx2: - push r15 - push r14 - push r13 - push r12 - push rsi - push rdi - push rbx - push rbp - mov rbp, rsp - sub rsp, 880 - and rsp, 0xFFFFFFFFFFFFFFC0 - vmovdqa xmmword ptr [rsp+0x2D0], xmm6 - vmovdqa xmmword ptr [rsp+0x2E0], xmm7 - vmovdqa xmmword ptr [rsp+0x2F0], xmm8 - vmovdqa xmmword ptr [rsp+0x300], xmm9 - vmovdqa xmmword ptr [rsp+0x310], xmm10 - vmovdqa xmmword ptr [rsp+0x320], xmm11 - vmovdqa xmmword ptr [rsp+0x330], xmm12 - vmovdqa xmmword ptr [rsp+0x340], xmm13 - vmovdqa xmmword ptr [rsp+0x350], xmm14 - vmovdqa xmmword ptr [rsp+0x360], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+0x68] - movzx r9, byte ptr [rbp+0x70] - neg r9d - vmovd xmm0, r9d - vpbroadcastd ymm0, xmm0 - vmovdqa ymmword ptr [rsp+0x260], ymm0 - vpand ymm1, ymm0, ymmword ptr [ADD0+rip] - vpand ymm2, ymm0, ymmword ptr [ADD1+rip] - vmovdqa ymmword ptr [rsp+0x2A0], ymm2 - vmovd xmm2, r8d - vpbroadcastd ymm2, xmm2 - vpaddd ymm2, ymm2, ymm1 - vmovdqa ymmword ptr [rsp+0x220], ymm2 - vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm1, ymm2 - shr r8, 32 - vmovd xmm3, r8d - vpbroadcastd ymm3, xmm3 - vpsubd ymm3, ymm3, ymm2 - vmovdqa ymmword ptr [rsp+0x240], ymm3 - shl rdx, 6 - mov qword ptr [rsp+0x2C0], rdx - cmp rsi, 8 - jc 3f -2: - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword 
ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x78] - movzx ebx, byte ptr [rbp+0x80] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x88] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x2C0] - cmove eax, ebx - mov dword ptr [rsp+0x200], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x20], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x40], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x60], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x80], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0xA0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0xC0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0xE0], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, 
xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x100], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x120], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x140], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x160], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+0x180], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0x1A0], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0x1C0], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0x1E0], ymm11 - vpbroadcastd ymm15, dword ptr [rsp+0x200] - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - 
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] - vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] - vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] - vpxor ymm15, ymm3, ymm15 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] - vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] - vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] - vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor 
ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr 
[rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr 
[ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, 
ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 
- vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, 
ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, 
ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, 
xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] - vpaddd ymm0, 
ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, 
ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld 
ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - 
vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 
ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] - vpaddd 
ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+0x200], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld 
ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x78] - jne 9b - mov rbx, qword ptr [rbp+0x90] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, 
ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp+0x2A0] - vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220] - vmovdqa ymmword ptr [rsp+0x220], ymm1 - vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] - vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] - vpcmpgtd ymm2, ymm0, ymm2 - vmovdqa ymm0, ymmword ptr [rsp+0x240] - vpsubd ymm2, ymm0, ymm2 - vmovdqa ymmword ptr [rsp+0x240], ymm2 - add rdi, 64 - add rbx, 256 - mov qword ptr [rbp+0x90], rbx - sub rsi, 8 - cmp rsi, 8 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - vmovdqa xmm6, xmmword ptr [rsp+0x2D0] - vmovdqa xmm7, xmmword ptr [rsp+0x2E0] - vmovdqa xmm8, xmmword ptr [rsp+0x2F0] - vmovdqa xmm9, xmmword ptr [rsp+0x300] - vmovdqa xmm10, xmmword ptr [rsp+0x310] - vmovdqa xmm11, xmmword ptr [rsp+0x320] - vmovdqa xmm12, xmmword ptr [rsp+0x330] - vmovdqa xmm13, xmmword ptr [rsp+0x340] - vmovdqa xmm14, xmmword ptr [rsp+0x350] - vmovdqa xmm15, xmmword ptr [rsp+0x360] - mov rsp, rbp - pop rbp - pop rbx - pop rdi - pop rsi - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - mov rbx, qword ptr [rbp+0x90] - mov r15, qword ptr [rsp+0x2C0] - movzx r13d, byte ptr [rbp+0x78] - movzx r12d, byte ptr [rbp+0x88] - test rsi, 0x4 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovdqa ymm8, ymm0 - vmovdqa ymm9, ymm1 - vbroadcasti128 ymm12, xmmword ptr 
[rsp+0x220] - vbroadcasti128 ymm13, xmmword ptr [rsp+0x240] - vpunpckldq ymm14, ymm12, ymm13 - vpunpckhdq ymm15, ymm12, ymm13 - vpermq ymm14, ymm14, 0x50 - vpermq ymm15, ymm15, 0x50 - vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - vpblendd ymm14, ymm14, ymm12, 0x44 - vpblendd ymm15, ymm15, ymm12, 0x44 - vmovdqa ymmword ptr [rsp], ymm14 - vmovdqa ymmword ptr [rsp+0x20], ymm15 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vmovups ymm2, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm2, ymm3, 136 - vshufps ymm5, ymm2, ymm3, 221 - vmovups ymm2, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm3, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm2, ymm3, 136 - vshufps ymm7, ymm2, ymm3, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - vmovups ymm10, ymmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 - vshufps ymm12, ymm10, ymm11, 136 - vshufps ymm13, ymm10, ymm11, 221 - vmovups ymm10, ymmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 - vmovups ymm11, ymmword ptr [r10+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 - vshufps ymm14, ymm10, ymm11, 136 - vshufps ymm15, ymm10, ymm11, 221 - vpshufd ymm14, ymm14, 0x93 - vpshufd ymm15, ymm15, 0x93 - vpbroadcastd ymm2, dword ptr [rsp+0x200] - vmovdqa ymm3, ymmword ptr [rsp] - vmovdqa ymm11, 
ymmword ptr [rsp+0x20] - vpblendd ymm3, ymm3, ymm2, 0x88 - vpblendd ymm11, ymm11, ymm2, 0x88 - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa ymm10, ymm2 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm8, ymm8, ymm12 - vmovdqa ymmword ptr [rsp+0x40], ymm4 - nop - vmovdqa ymmword ptr [rsp+0x60], ymm12 - nop - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vmovdqa ymmword ptr [rsp+0x80], ymm5 - vmovdqa ymmword ptr [rsp+0xA0], ymm13 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm8, ymm8, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpshufd ymm10, ymm10, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm8, ymm8, ymm14 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - 
vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm8, ymm8, ymm15 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm8, ymm8, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm11, ymm11, 0x4E - vpshufd ymm2, ymm2, 0x93 - vpshufd ymm10, ymm10, 0x93 - dec al - je 9f - vmovdqa ymm4, ymmword ptr [rsp+0x40] - vmovdqa ymm5, ymmword ptr [rsp+0x80] - vshufps ymm12, ymm4, ymm5, 214 - vpshufd ymm13, ymm4, 0x0F - vpshufd ymm4, ymm12, 0x39 - vshufps ymm12, ymm6, ymm7, 250 - vpblendd ymm13, ymm13, ymm12, 0xAA - vpunpcklqdq ymm12, ymm7, ymm5 - vpblendd ymm12, ymm12, ymm6, 0x88 - vpshufd ymm12, ymm12, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymmword ptr [rsp+0x40], ymm13 - vmovdqa ymmword ptr [rsp+0x80], ymm12 - vmovdqa ymm12, ymmword ptr [rsp+0x60] - vmovdqa ymm13, ymmword ptr [rsp+0xA0] - vshufps ymm5, ymm12, ymm13, 214 - vpshufd ymm6, ymm12, 0x0F - vpshufd ymm12, ymm5, 0x39 - vshufps ymm5, ymm14, ymm15, 250 - vpblendd ymm6, ymm6, ymm5, 0xAA - vpunpcklqdq ymm5, ymm15, ymm13 - vpblendd ymm5, ymm5, ymm14, 0x88 - vpshufd ymm5, ymm5, 0x78 - vpunpckhdq ymm13, ymm13, ymm15 - vpunpckldq ymm14, ymm14, ymm13 - vpshufd ymm15, ymm14, 0x1E - vmovdqa ymm13, ymm6 - vmovdqa ymm14, ymm5 - vmovdqa ymm5, ymmword ptr [rsp+0x40] - vmovdqa ymm6, ymmword ptr [rsp+0x80] - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - vpxor ymm8, ymm8, ymm10 - vpxor ymm9, ymm9, ymm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr 
[rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqu xmmword ptr [rbx+0x40], xmm8 - vmovdqu xmmword ptr [rbx+0x50], xmm9 - vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 - vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 - vmovaps xmm8, xmmword ptr [rsp+0x260] - vmovaps xmm0, xmmword ptr [rsp+0x220] - vmovaps xmm1, xmmword ptr [rsp+0x230] - vmovaps xmm2, xmmword ptr [rsp+0x240] - vmovaps xmm3, xmmword ptr [rsp+0x250] - vblendvps xmm0, xmm0, xmm1, xmm8 - vblendvps xmm2, xmm2, xmm3, xmm8 - vmovaps xmmword ptr [rsp+0x220], xmm0 - vmovaps xmmword ptr [rsp+0x240], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test rsi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp+0x220] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x224] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] - vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x200], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x200] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr 
[r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovaps ymm8, ymmword ptr [rsp+0x260] - vmovaps ymm0, 
ymmword ptr [rsp+0x220] - vmovups ymm1, ymmword ptr [rsp+0x228] - vmovaps ymm2, ymmword ptr [rsp+0x240] - vmovups ymm3, ymmword ptr [rsp+0x248] - vblendvps ymm0, ymm0, ymm1, ymm8 - vblendvps ymm2, ymm2, ymm3, ymm8 - vmovaps ymmword ptr [rsp+0x220], ymm0 - vmovaps ymmword ptr [rsp+0x240], ymm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test rsi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm3, dword ptr [rsp+0x220] - vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 - vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm14, xmmword ptr [ROT16+rip] - vmovdqa xmm15, xmmword ptr [ROT8+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovdqa xmm3, xmm13 - vpinsrd xmm3, xmm3, eax, 3 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - 
vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.section .rodata -.p2align 6 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 -ADD1: - .long 8, 8, 8, 8, 8, 8, 8, 8 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 - .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85, 
0x3C6EF372, 0xA54FF53A - diff --git a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm deleted file mode 100644 index 352298edd..000000000 --- a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm +++ /dev/null @@ -1,1828 +0,0 @@ -public _blake3_hash_many_avx2 -public blake3_hash_many_avx2 - -_TEXT SEGMENT ALIGN(16) 'CODE' - -ALIGN 16 -blake3_hash_many_avx2 PROC -_blake3_hash_many_avx2 PROC - push r15 - push r14 - push r13 - push r12 - push rsi - push rdi - push rbx - push rbp - mov rbp, rsp - sub rsp, 880 - and rsp, 0FFFFFFFFFFFFFFC0H - vmovdqa xmmword ptr [rsp+2D0H], xmm6 - vmovdqa xmmword ptr [rsp+2E0H], xmm7 - vmovdqa xmmword ptr [rsp+2F0H], xmm8 - vmovdqa xmmword ptr [rsp+300H], xmm9 - vmovdqa xmmword ptr [rsp+310H], xmm10 - vmovdqa xmmword ptr [rsp+320H], xmm11 - vmovdqa xmmword ptr [rsp+330H], xmm12 - vmovdqa xmmword ptr [rsp+340H], xmm13 - vmovdqa xmmword ptr [rsp+350H], xmm14 - vmovdqa xmmword ptr [rsp+360H], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+68H] - movzx r9, byte ptr [rbp+70H] - neg r9d - vmovd xmm0, r9d - vpbroadcastd ymm0, xmm0 - vmovdqa ymmword ptr [rsp+260H], ymm0 - vpand ymm1, ymm0, ymmword ptr [ADD0] - vpand ymm2, ymm0, ymmword ptr [ADD1] - vmovdqa ymmword ptr [rsp+2A0H], ymm2 - vmovd xmm2, r8d - vpbroadcastd ymm2, xmm2 - vpaddd ymm2, ymm2, ymm1 - vmovdqa ymmword ptr [rsp+220H], ymm2 - vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] - vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] - vpcmpgtd ymm2, ymm1, ymm2 - shr r8, 32 - vmovd xmm3, r8d - vpbroadcastd ymm3, xmm3 - vpsubd ymm3, ymm3, ymm2 - vmovdqa ymmword ptr [rsp+240H], ymm3 - shl rdx, 6 - mov qword ptr [rsp+2C0H], rdx - cmp rsi, 8 - jc final7blocks -outerloop8: - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+4H] - vpbroadcastd ymm2, dword ptr [rcx+8H] - vpbroadcastd ymm3, dword ptr [rcx+0CH] - vpbroadcastd ymm4, dword ptr [rcx+10H] - vpbroadcastd ymm5, 
dword ptr [rcx+14H] - vpbroadcastd ymm6, dword ptr [rcx+18H] - vpbroadcastd ymm7, dword ptr [rcx+1CH] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr [rdi+18H] - mov r12, qword ptr [rdi+20H] - mov r13, qword ptr [rdi+28H] - mov r14, qword ptr [rdi+30H] - mov r15, qword ptr [rdi+38H] - movzx eax, byte ptr [rbp+78H] - movzx ebx, byte ptr [rbp+80H] - or eax, ebx - xor edx, edx -ALIGN 16 -innerloop8: - movzx ebx, byte ptr [rbp+88H] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+2C0H] - cmove eax, ebx - mov dword ptr [rsp+200H], eax - vmovups xmm8, xmmword ptr [r8+rdx-40H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H - vmovups xmm9, xmmword ptr [r9+rdx-40H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-40H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-40H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+20H], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+40H], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+60H], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-30H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H - vmovups xmm9, xmmword ptr [r9+rdx-30H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-30H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-30H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr 
[rsp+80H], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+0A0H], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+0C0H], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+0E0H], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-20H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H - vmovups xmm9, xmmword ptr [r9+rdx-20H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-20H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-20H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+100H], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+120H], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+140H], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+160H], ymm11 - vmovups xmm8, xmmword ptr [r8+rdx-10H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H - vmovups xmm9, xmmword ptr [r9+rdx-10H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-10H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-10H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm8, ymm12, ymm14, 136 - vmovaps ymmword ptr [rsp+180H], ymm8 - vshufps ymm9, ymm12, ymm14, 221 - vmovaps ymmword ptr [rsp+1A0H], ymm9 - vshufps ymm10, ymm13, ymm15, 136 - vmovaps ymmword ptr [rsp+1C0H], ymm10 - vshufps ymm11, ymm13, ymm15, 221 - vmovaps ymmword ptr [rsp+1E0H], ymm11 - vpbroadcastd ymm15, dword ptr [rsp+200H] - prefetcht0 byte ptr [r8+rdx+80H] - prefetcht0 byte ptr [r12+rdx+80H] - 
prefetcht0 byte ptr [r9+rdx+80H] - prefetcht0 byte ptr [r13+rdx+80H] - prefetcht0 byte ptr [r10+rdx+80H] - prefetcht0 byte ptr [r14+rdx+80H] - prefetcht0 byte ptr [r11+rdx+80H] - prefetcht0 byte ptr [r15+rdx+80H] - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm0, ymmword ptr [rsp+220H] - vpxor ymm13, ymm1, ymmword ptr [rsp+240H] - vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] - vpxor ymm15, ymm3, ymm15 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] - vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] - vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] - vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, 
ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, 
ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, 
ymm3, ymmword ptr [rsp+1A0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld 
ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, 
ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, 
ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr 
[ROT16] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, 
ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp] - vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, 
ymm0, ymmword ptr [rsp+180H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 
7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] - vpaddd ymm2, ymm2, ymmword ptr [rsp] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor 
ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb 
ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, 
ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp] - 
vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxor ymm12, ymm12, ymm0 - vpxor ymm13, ymm13, ymm1 - vpxor ymm14, ymm14, ymm2 - vpxor ymm15, ymm15, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpshufb ymm15, ymm15, ymm8 - vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxor ymm4, ymm4, ymm8 - vpxor ymm5, ymm5, ymm9 - vpxor ymm6, ymm6, ymm10 - vpxor ymm7, ymm7, ymm11 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT16] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vmovdqa ymmword ptr [rsp+200H], ymm8 - vpsrld ymm8, ymm5, 12 - vpslld ymm5, ymm5, 20 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 12 - vpslld ymm6, ymm6, 20 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 12 - 
vpslld ymm7, ymm7, 20 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 12 - vpslld ymm4, ymm4, 20 - vpor ymm4, ymm4, ymm8 - vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] - vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] - vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] - vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxor ymm15, ymm15, ymm0 - vpxor ymm12, ymm12, ymm1 - vpxor ymm13, ymm13, ymm2 - vpxor ymm14, ymm14, ymm3 - vbroadcasti128 ymm8, xmmword ptr [ROT8] - vpshufb ymm15, ymm15, ymm8 - vpshufb ymm12, ymm12, ymm8 - vpshufb ymm13, ymm13, ymm8 - vpshufb ymm14, ymm14, ymm8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] - vpaddd ymm9, ymm9, ymm14 - vpxor ymm5, ymm5, ymm10 - vpxor ymm6, ymm6, ymm11 - vpxor ymm7, ymm7, ymm8 - vpxor ymm4, ymm4, ymm9 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpsrld ymm8, ymm5, 7 - vpslld ymm5, ymm5, 25 - vpor ymm5, ymm5, ymm8 - vpsrld ymm8, ymm6, 7 - vpslld ymm6, ymm6, 25 - vpor ymm6, ymm6, ymm8 - vpsrld ymm8, ymm7, 7 - vpslld ymm7, ymm7, 25 - vpor ymm7, ymm7, ymm8 - vpsrld ymm8, ymm4, 7 - vpslld ymm4, ymm4, 25 - vpor ymm4, ymm4, ymm8 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+78H] - jne innerloop8 - mov rbx, qword ptr [rbp+90H] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0CCH - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0CCH - vblendps ymm3, ymm12, ymm9, 0CCH - vperm2f128 ymm12, ymm1, ymm2, 20H - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0CCH - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, 
ymm3, ymm4, 20H - vmovups ymmword ptr [rbx+20H], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0CCH - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0CCH - vblendps ymm14, ymm14, ymm13, 0CCH - vperm2f128 ymm8, ymm10, ymm14, 20H - vmovups ymmword ptr [rbx+40H], ymm8 - vblendps ymm15, ymm13, ymm15, 0CCH - vperm2f128 ymm13, ymm6, ymm15, 20H - vmovups ymmword ptr [rbx+60H], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 31H - vperm2f128 ymm11, ymm3, ymm4, 31H - vmovups ymmword ptr [rbx+80H], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 31H - vperm2f128 ymm15, ymm6, ymm15, 31H - vmovups ymmword ptr [rbx+0A0H], ymm11 - vmovups ymmword ptr [rbx+0C0H], ymm14 - vmovups ymmword ptr [rbx+0E0H], ymm15 - vmovdqa ymm0, ymmword ptr [rsp+2A0H] - vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] - vmovdqa ymmword ptr [rsp+220H], ymm1 - vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] - vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] - vpcmpgtd ymm2, ymm0, ymm2 - vmovdqa ymm0, ymmword ptr [rsp+240H] - vpsubd ymm2, ymm0, ymm2 - vmovdqa ymmword ptr [rsp+240H], ymm2 - add rdi, 64 - add rbx, 256 - mov qword ptr [rbp+90H], rbx - sub rsi, 8 - cmp rsi, 8 - jnc outerloop8 - test rsi, rsi - jnz final7blocks -unwind: - vzeroupper - vmovdqa xmm6, xmmword ptr [rsp+2D0H] - vmovdqa xmm7, xmmword ptr [rsp+2E0H] - vmovdqa xmm8, xmmword ptr [rsp+2F0H] - vmovdqa xmm9, xmmword ptr [rsp+300H] - vmovdqa xmm10, xmmword ptr [rsp+310H] - vmovdqa xmm11, xmmword ptr [rsp+320H] - vmovdqa xmm12, xmmword ptr [rsp+330H] - vmovdqa xmm13, xmmword ptr [rsp+340H] - vmovdqa xmm14, xmmword ptr [rsp+350H] - vmovdqa xmm15, xmmword ptr [rsp+360H] - mov rsp, rbp - pop rbp - pop rbx - pop rdi - pop rsi - pop r12 - pop r13 - pop r14 - pop r15 - ret -ALIGN 16 -final7blocks: - mov rbx, qword ptr [rbp+90H] - mov r15, qword ptr [rsp+2C0H] - movzx r13d, byte ptr [rbp+78H] - movzx r12d, byte ptr [rbp+88H] - test rsi, 4H - je final3blocks - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+10H] 
- vmovdqa ymm8, ymm0 - vmovdqa ymm9, ymm1 - vbroadcasti128 ymm12, xmmword ptr [rsp+220H] - vbroadcasti128 ymm13, xmmword ptr [rsp+240H] - vpunpckldq ymm14, ymm12, ymm13 - vpunpckhdq ymm15, ymm12, ymm13 - vpermq ymm14, ymm14, 50H - vpermq ymm15, ymm15, 50H - vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] - vpblendd ymm14, ymm14, ymm12, 44H - vpblendd ymm15, ymm15, ymm12, 44H - vmovdqa ymmword ptr [rsp], ymm14 - vmovdqa ymmword ptr [rsp+20H], ymm15 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr [rdi+18H] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -ALIGN 16 -innerloop4: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+200H], eax - vmovups ymm2, ymmword ptr [r8+rdx-40H] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H - vmovups ymm3, ymmword ptr [r8+rdx-30H] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H - vshufps ymm4, ymm2, ymm3, 136 - vshufps ymm5, ymm2, ymm3, 221 - vmovups ymm2, ymmword ptr [r8+rdx-20H] - vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H - vmovups ymm3, ymmword ptr [r8+rdx-10H] - vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H - vshufps ymm6, ymm2, ymm3, 136 - vshufps ymm7, ymm2, ymm3, 221 - vpshufd ymm6, ymm6, 93H - vpshufd ymm7, ymm7, 93H - vmovups ymm10, ymmword ptr [r10+rdx-40H] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H - vmovups ymm11, ymmword ptr [r10+rdx-30H] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H - vshufps ymm12, ymm10, ymm11, 136 - vshufps ymm13, ymm10, ymm11, 221 - vmovups ymm10, ymmword ptr [r10+rdx-20H] - vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H - vmovups ymm11, ymmword ptr [r10+rdx-10H] - vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H - vshufps ymm14, ymm10, ymm11, 136 - vshufps ymm15, ymm10, ymm11, 221 - vpshufd ymm14, ymm14, 93H - vpshufd ymm15, ymm15, 93H - vpbroadcastd ymm2, dword ptr [rsp+200H] - vmovdqa ymm3, 
ymmword ptr [rsp] - vmovdqa ymm11, ymmword ptr [rsp+20H] - vpblendd ymm3, ymm3, ymm2, 88H - vpblendd ymm11, ymm11, ymm2, 88H - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] - vmovdqa ymm10, ymm2 - mov al, 7 -roundloop4: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm8, ymm8, ymm12 - vmovdqa ymmword ptr [rsp+40H], ymm4 - nop - vmovdqa ymmword ptr [rsp+60H], ymm12 - nop - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vmovdqa ymmword ptr [rsp+80H], ymm5 - vmovdqa ymmword ptr [rsp+0A0H], ymm13 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 93H - vpshufd ymm8, ymm8, 93H - vpshufd ymm3, ymm3, 4EH - vpshufd ymm11, ymm11, 4EH - vpshufd ymm2, ymm2, 39H - vpshufd ymm10, ymm10, 39H - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm8, ymm8, ymm14 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT16] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm4 - vpsrld 
ymm4, ymm9, 12 - vpslld ymm9, ymm9, 20 - vpor ymm9, ymm9, ymm4 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm8, ymm8, ymm15 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm8, ymm8, ymm9 - vpxor ymm3, ymm3, ymm0 - vpxor ymm11, ymm11, ymm8 - vbroadcasti128 ymm4, xmmword ptr [ROT8] - vpshufb ymm3, ymm3, ymm4 - vpshufb ymm11, ymm11, ymm4 - vpaddd ymm2, ymm2, ymm3 - vpaddd ymm10, ymm10, ymm11 - vpxor ymm1, ymm1, ymm2 - vpxor ymm9, ymm9, ymm10 - vpsrld ymm4, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm4 - vpsrld ymm4, ymm9, 7 - vpslld ymm9, ymm9, 25 - vpor ymm9, ymm9, ymm4 - vpshufd ymm0, ymm0, 39H - vpshufd ymm8, ymm8, 39H - vpshufd ymm3, ymm3, 4EH - vpshufd ymm11, ymm11, 4EH - vpshufd ymm2, ymm2, 93H - vpshufd ymm10, ymm10, 93H - dec al - je endroundloop4 - vmovdqa ymm4, ymmword ptr [rsp+40H] - vmovdqa ymm5, ymmword ptr [rsp+80H] - vshufps ymm12, ymm4, ymm5, 214 - vpshufd ymm13, ymm4, 0FH - vpshufd ymm4, ymm12, 39H - vshufps ymm12, ymm6, ymm7, 250 - vpblendd ymm13, ymm13, ymm12, 0AAH - vpunpcklqdq ymm12, ymm7, ymm5 - vpblendd ymm12, ymm12, ymm6, 88H - vpshufd ymm12, ymm12, 78H - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 1EH - vmovdqa ymmword ptr [rsp+40H], ymm13 - vmovdqa ymmword ptr [rsp+80H], ymm12 - vmovdqa ymm12, ymmword ptr [rsp+60H] - vmovdqa ymm13, ymmword ptr [rsp+0A0H] - vshufps ymm5, ymm12, ymm13, 214 - vpshufd ymm6, ymm12, 0FH - vpshufd ymm12, ymm5, 39H - vshufps ymm5, ymm14, ymm15, 250 - vpblendd ymm6, ymm6, ymm5, 0AAH - vpunpcklqdq ymm5, ymm15, ymm13 - vpblendd ymm5, ymm5, ymm14, 88H - vpshufd ymm5, ymm5, 78H - vpunpckhdq ymm13, ymm13, ymm15 - vpunpckldq ymm14, ymm14, ymm13 - vpshufd ymm15, ymm14, 1EH - vmovdqa ymm13, ymm6 - vmovdqa ymm14, ymm5 - vmovdqa ymm5, ymmword ptr [rsp+40H] - vmovdqa ymm6, ymmword ptr [rsp+80H] - jmp roundloop4 -endroundloop4: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - vpxor ymm8, ymm8, ymm10 - vpxor ymm9, ymm9, ymm11 - mov eax, r13d - cmp rdx, r15 - jne innerloop4 - vmovdqu xmmword ptr [rbx], 
xmm0 - vmovdqu xmmword ptr [rbx+10H], xmm1 - vextracti128 xmmword ptr [rbx+20H], ymm0, 01H - vextracti128 xmmword ptr [rbx+30H], ymm1, 01H - vmovdqu xmmword ptr [rbx+40H], xmm8 - vmovdqu xmmword ptr [rbx+50H], xmm9 - vextracti128 xmmword ptr [rbx+60H], ymm8, 01H - vextracti128 xmmword ptr [rbx+70H], ymm9, 01H - vmovaps xmm8, xmmword ptr [rsp+260H] - vmovaps xmm0, xmmword ptr [rsp+220H] - vmovaps xmm1, xmmword ptr [rsp+230H] - vmovaps xmm2, xmmword ptr [rsp+240H] - vmovaps xmm3, xmmword ptr [rsp+250H] - vblendvps xmm0, xmm0, xmm1, xmm8 - vblendvps xmm2, xmm2, xmm3, xmm8 - vmovaps xmmword ptr [rsp+220H], xmm0 - vmovaps xmmword ptr [rsp+240H], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -final3blocks: - test rsi, 2H - je final1blocks - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+10H] - vmovd xmm13, dword ptr [rsp+220H] - vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 - vmovd xmm14, dword ptr [rsp+224H] - vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 - vinserti128 ymm13, ymm13, xmm14, 01H - vbroadcasti128 ymm14, xmmword ptr [ROT16] - vbroadcasti128 ymm15, xmmword ptr [ROT8] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -ALIGN 16 -innerloop2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+200H], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] - vpbroadcastd ymm8, dword ptr [rsp+200H] - vpblendd ymm3, ymm13, ymm8, 88H - vmovups ymm8, ymmword ptr [r8+rdx-40H] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H - vmovups ymm9, ymmword ptr [r8+rdx-30H] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-20H] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H - vmovups ymm9, ymmword ptr 
[r8+rdx-10H] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 93H - vpshufd ymm7, ymm7, 93H - mov al, 7 -roundloop2: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 93H - vpshufd ymm3, ymm3, 4EH - vpshufd ymm2, ymm2, 39H - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm14 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 12 - vpslld ymm1, ymm1, 20 - vpor ymm1, ymm1, ymm8 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxor ymm3, ymm3, ymm0 - vpshufb ymm3, ymm3, ymm15 - vpaddd ymm2, ymm2, ymm3 - vpxor ymm1, ymm1, ymm2 - vpsrld ymm8, ymm1, 7 - vpslld ymm1, ymm1, 25 - vpor ymm1, ymm1, ymm8 - vpshufd ymm0, ymm0, 39H - vpshufd ymm3, ymm3, 4EH - vpshufd ymm2, ymm2, 93H - dec al - jz endroundloop2 - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0FH - vpshufd ymm4, ymm8, 39H - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0AAH - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 88H - vpshufd ymm8, ymm8, 78H - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 1EH - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp roundloop2 -endroundloop2: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne innerloop2 - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+10H], xmm1 - vextracti128 xmmword ptr [rbx+20H], ymm0, 01H - vextracti128 xmmword ptr [rbx+30H], ymm1, 01H - vmovaps ymm8, ymmword ptr 
[rsp+260H] - vmovaps ymm0, ymmword ptr [rsp+220H] - vmovups ymm1, ymmword ptr [rsp+228H] - vmovaps ymm2, ymmword ptr [rsp+240H] - vmovups ymm3, ymmword ptr [rsp+248H] - vblendvps ymm0, ymm0, ymm1, ymm8 - vblendvps ymm2, ymm2, ymm3, ymm8 - vmovaps ymmword ptr [rsp+220H], ymm0 - vmovaps ymmword ptr [rsp+240H], ymm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -final1blocks: - test rsi, 1H - je unwind - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+10H] - vmovd xmm3, dword ptr [rsp+220H] - vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 - vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 - vmovdqa xmm14, xmmword ptr [ROT16] - vmovdqa xmm15, xmmword ptr [ROT8] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -ALIGN 16 -innerloop1: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vmovdqa xmm2, xmmword ptr [BLAKE3_IV] - vmovdqa xmm3, xmm13 - vpinsrd xmm3, xmm3, eax, 3 - vmovups xmm8, xmmword ptr [r8+rdx-40H] - vmovups xmm9, xmmword ptr [r8+rdx-30H] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-20H] - vmovups xmm9, xmmword ptr [r8+rdx-10H] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 93H - vpshufd xmm7, xmm7, 93H - mov al, 7 -roundloop1: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 93H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 39H - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm14 - vpaddd xmm2, xmm2, xmm3 - 
vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 12 - vpslld xmm1, xmm1, 20 - vpor xmm1, xmm1, xmm8 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxor xmm3, xmm3, xmm0 - vpshufb xmm3, xmm3, xmm15 - vpaddd xmm2, xmm2, xmm3 - vpxor xmm1, xmm1, xmm2 - vpsrld xmm8, xmm1, 7 - vpslld xmm1, xmm1, 25 - vpor xmm1, xmm1, xmm8 - vpshufd xmm0, xmm0, 39H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 93H - dec al - jz endroundloop1 - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0FH - vpshufd xmm4, xmm8, 39H - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0AAH - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 88H - vpshufd xmm8, xmm8, 78H - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 1EH - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp roundloop1 -endroundloop1: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne innerloop1 - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+10H], xmm1 - jmp unwind - -_blake3_hash_many_avx2 ENDP -blake3_hash_many_avx2 ENDP -_TEXT ENDS - -_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' -ALIGN 64 -ADD0: - dd 0, 1, 2, 3, 4, 5, 6, 7 - -ADD1: - dd 8 dup (8) - -BLAKE3_IV_0: - dd 8 dup (6A09E667H) - -BLAKE3_IV_1: - dd 8 dup (0BB67AE85H) - -BLAKE3_IV_2: - dd 8 dup (3C6EF372H) - -BLAKE3_IV_3: - dd 8 dup (0A54FF53AH) - -BLAKE3_BLOCK_LEN: - dd 8 dup (64) - -ROT16: - db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 - -ROT8: - db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 - -CMP_MSB_MASK: - dd 8 dup(80000000H) - -BLAKE3_IV: - dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH - -_RDATA ENDS -END diff --git a/thirdparty/BLAKE3/c/blake3_avx512.c b/thirdparty/BLAKE3/c/blake3_avx512.c deleted file mode 100644 index 77a5c385c..000000000 --- a/thirdparty/BLAKE3/c/blake3_avx512.c +++ /dev/null @@ -1,1204 +0,0 @@ -#include "blake3_impl.h" - -#include <immintrin.h> - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - 
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu_128(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE __m256i loadu_256(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); -} - -INLINE __m512i loadu_512(const uint8_t src[64]) { - return _mm512_loadu_si512((const __m512i *)src); -} - -INLINE void storeu_128(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE void storeu_256(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); -} - -INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } - -INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } - -INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } - -INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } - -INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } - -INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } - -INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } - -INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } - -INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } - -INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } - -INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } - -INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } - -INLINE 
__m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } - -INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } - -INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } - -INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } - -INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } - -/* - * ---------------------------------------------------------------------------- - * compress_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot16_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot12_128(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = add_128(add_128(*row0, m), *row1); - *row3 = xor_128(*row3, *row0); - *row3 = rot8_128(*row3); - *row2 = add_128(*row2, *row3); - *row1 = xor_128(*row1, *row2); - *row1 = rot7_128(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu_128((uint8_t *)&cv[0]); - rows[1] = loadu_128((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - 
undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), &out[0]); - storeu_128(xor_128(rows[1], rows[3]), &out[16]); - storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); - storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); -} - -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -/* - * ---------------------------------------------------------------------------- - * hash4_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[15] = rot16_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] 
= rot12_128(v[4]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[15] = rot8_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] = rot7_128(v[4]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot16_128(v[15]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[4] = rot12_128(v[4]); - v[0] = add_128(v[0], 
m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot8_128(v[15]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - v[4] = rot7_128(v[4]); -} - -INLINE void transpose_vecs_128(__m128i vecs[4]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_128(&out[0]); - transpose_vecs_128(&out[4]); - transpose_vecs_128(&out[8]); - transpose_vecs_128(&out[12]); -} - -INLINE void load_counters4(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - uint64_t mask = (increment_counter ? 
~0 : 0); - __m256i mask_vec = _mm256_set1_epi64x(mask); - __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); - deltas = _mm256_and_si256(mask_vec, deltas); - __m256i counters = - _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); - *out_lo = _mm256_cvtepi64_epi32(counters); - *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); -} - -void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), - set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters4(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1_128(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn4(v, msg_vecs, 0); - round_fn4(v, msg_vecs, 1); - round_fn4(v, msg_vecs, 2); - round_fn4(v, msg_vecs, 3); - round_fn4(v, msg_vecs, 4); - round_fn4(v, msg_vecs, 5); - round_fn4(v, msg_vecs, 6); - h_vecs[0] = xor_128(v[0], v[8]); - h_vecs[1] = xor_128(v[1], v[9]); - h_vecs[2] = xor_128(v[2], v[10]); - h_vecs[3] = xor_128(v[3], v[11]); - h_vecs[4] = xor_128(v[4], v[12]); - h_vecs[5] = xor_128(v[5], v[13]); - h_vecs[6] = xor_128(v[6], v[14]); - h_vecs[7] = xor_128(v[7], v[15]); - - block_flags = 
flags; - } - - transpose_vecs_128(&h_vecs[0]); - transpose_vecs_128(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash8_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[15] = rot16_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot12_256(v[4]); - v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - 
v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_256(v[0], v[4]); - v[1] = add_256(v[1], v[5]); - v[2] = add_256(v[2], v[6]); - v[3] = add_256(v[3], v[7]); - v[12] = xor_256(v[12], v[0]); - v[13] = xor_256(v[13], v[1]); - v[14] = xor_256(v[14], v[2]); - v[15] = xor_256(v[15], v[3]); - v[12] = rot8_256(v[12]); - v[13] = rot8_256(v[13]); - v[14] = rot8_256(v[14]); - v[15] = rot8_256(v[15]); - v[8] = add_256(v[8], v[12]); - v[9] = add_256(v[9], v[13]); - v[10] = add_256(v[10], v[14]); - v[11] = add_256(v[11], v[15]); - v[4] = xor_256(v[4], v[8]); - v[5] = xor_256(v[5], v[9]); - v[6] = xor_256(v[6], v[10]); - v[7] = xor_256(v[7], v[11]); - v[4] = rot7_256(v[4]); - v[5] = rot7_256(v[5]); - v[6] = rot7_256(v[6]); - v[7] = rot7_256(v[7]); - - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_256(v[0], v[5]); - v[1] = add_256(v[1], v[6]); - v[2] = add_256(v[2], v[7]); - v[3] = add_256(v[3], v[4]); - v[15] = xor_256(v[15], v[0]); - v[12] = xor_256(v[12], v[1]); - v[13] = xor_256(v[13], v[2]); - v[14] = xor_256(v[14], v[3]); - v[15] = rot16_256(v[15]); - v[12] = rot16_256(v[12]); - v[13] = rot16_256(v[13]); - v[14] = rot16_256(v[14]); - v[10] = add_256(v[10], v[15]); - v[11] = add_256(v[11], v[12]); - v[8] = add_256(v[8], v[13]); - v[9] = add_256(v[9], v[14]); - v[5] = xor_256(v[5], v[10]); - v[6] = xor_256(v[6], v[11]); - v[7] = xor_256(v[7], v[8]); - v[4] = xor_256(v[4], v[9]); - v[5] = rot12_256(v[5]); - v[6] = rot12_256(v[6]); - v[7] = rot12_256(v[7]); - v[4] = rot12_256(v[4]); - v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_256(v[0], v[5]); - v[1] = add_256(v[1], v[6]); - v[2] = 
add_256(v[2], v[7]); - v[3] = add_256(v[3], v[4]); - v[15] = xor_256(v[15], v[0]); - v[12] = xor_256(v[12], v[1]); - v[13] = xor_256(v[13], v[2]); - v[14] = xor_256(v[14], v[3]); - v[15] = rot8_256(v[15]); - v[12] = rot8_256(v[12]); - v[13] = rot8_256(v[13]); - v[14] = rot8_256(v[14]); - v[10] = add_256(v[10], v[15]); - v[11] = add_256(v[11], v[12]); - v[8] = add_256(v[8], v[13]); - v[9] = add_256(v[9], v[14]); - v[5] = xor_256(v[5], v[10]); - v[6] = xor_256(v[6], v[11]); - v[7] = xor_256(v[7], v[8]); - v[4] = xor_256(v[4], v[9]); - v[5] = rot7_256(v[5]); - v[6] = rot7_256(v[6]); - v[7] = rot7_256(v[7]); - v[4] = rot7_256(v[4]); -} - -INLINE void transpose_vecs_256(__m256i vecs[8]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high - // is 22/33/66/77. - __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is - // 11/33. - __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); - vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); - vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); - vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); - vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); - vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); - vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); - vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); -} - -INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, - size_t block_offset, __m256i out[16]) { - out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); - out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); - out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); - out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); - out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); - out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); - out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); - out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); - out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); - out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); - out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); - out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); - out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); - out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); - out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); - out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); - for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_256(&out[0]); - transpose_vecs_256(&out[8]); -} - -INLINE void load_counters8(uint64_t counter, bool increment_counter, - __m256i *out_lo, 
__m256i *out_hi) { - uint64_t mask = (increment_counter ? ~0 : 0); - __m512i mask_vec = _mm512_set1_epi64(mask); - __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); - deltas = _mm512_and_si512(mask_vec, deltas); - __m512i counters = - _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); - *out_lo = _mm512_cvtepi64_epi32(counters); - *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); -} - -void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m256i h_vecs[8] = { - set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), - set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), - }; - __m256i counter_low_vec, counter_high_vec; - load_counters8(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); - __m256i block_flags_vec = set1_256(block_flags); - __m256i msg_vecs[16]; - transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn8(v, msg_vecs, 0); - round_fn8(v, msg_vecs, 1); - round_fn8(v, msg_vecs, 2); - round_fn8(v, msg_vecs, 3); - round_fn8(v, msg_vecs, 4); - round_fn8(v, msg_vecs, 5); - round_fn8(v, msg_vecs, 6); - h_vecs[0] = xor_256(v[0], v[8]); - h_vecs[1] = xor_256(v[1], v[9]); - h_vecs[2] = xor_256(v[2], v[10]); - h_vecs[3] = xor_256(v[3], v[11]); - h_vecs[4] = xor_256(v[4], v[12]); - h_vecs[5] = xor_256(v[5], v[13]); - h_vecs[6] = 
xor_256(v[6], v[14]); - h_vecs[7] = xor_256(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs_256(h_vecs); - storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); - storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); - storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); - storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); - storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); - storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); - storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); - storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash16_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_512(v[0], v[4]); - v[1] = add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[15] = rot16_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], v[11]); - v[4] = rot12_512(v[4]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_512(v[0], v[4]); - v[1] = 
add_512(v[1], v[5]); - v[2] = add_512(v[2], v[6]); - v[3] = add_512(v[3], v[7]); - v[12] = xor_512(v[12], v[0]); - v[13] = xor_512(v[13], v[1]); - v[14] = xor_512(v[14], v[2]); - v[15] = xor_512(v[15], v[3]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[15] = rot8_512(v[15]); - v[8] = add_512(v[8], v[12]); - v[9] = add_512(v[9], v[13]); - v[10] = add_512(v[10], v[14]); - v[11] = add_512(v[11], v[15]); - v[4] = xor_512(v[4], v[8]); - v[5] = xor_512(v[5], v[9]); - v[6] = xor_512(v[6], v[10]); - v[7] = xor_512(v[7], v[11]); - v[4] = rot7_512(v[4]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot16_512(v[15]); - v[12] = rot16_512(v[12]); - v[13] = rot16_512(v[13]); - v[14] = rot16_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = xor_512(v[4], v[9]); - v[5] = rot12_512(v[5]); - v[6] = rot12_512(v[6]); - v[7] = rot12_512(v[7]); - v[4] = rot12_512(v[4]); - v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_512(v[0], v[5]); - v[1] = add_512(v[1], v[6]); - v[2] = add_512(v[2], v[7]); - v[3] = add_512(v[3], v[4]); - v[15] = xor_512(v[15], v[0]); - v[12] = 
xor_512(v[12], v[1]); - v[13] = xor_512(v[13], v[2]); - v[14] = xor_512(v[14], v[3]); - v[15] = rot8_512(v[15]); - v[12] = rot8_512(v[12]); - v[13] = rot8_512(v[13]); - v[14] = rot8_512(v[14]); - v[10] = add_512(v[10], v[15]); - v[11] = add_512(v[11], v[12]); - v[8] = add_512(v[8], v[13]); - v[9] = add_512(v[9], v[14]); - v[5] = xor_512(v[5], v[10]); - v[6] = xor_512(v[6], v[11]); - v[7] = xor_512(v[7], v[8]); - v[4] = xor_512(v[4], v[9]); - v[5] = rot7_512(v[5]); - v[6] = rot7_512(v[6]); - v[7] = rot7_512(v[7]); - v[4] = rot7_512(v[4]); -} - -// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order -#define LO_IMM8 0x88 - -INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, LO_IMM8); -} - -// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order -#define HI_IMM8 0xdd - -INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { - return _mm512_shuffle_i32x4(a, b, HI_IMM8); -} - -INLINE void transpose_vecs_512(__m512i vecs[16]) { - // Interleave 32-bit lanes. The _0 unpack is lanes - // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes - // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
- __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); - __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); - __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); - __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); - __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); - __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); - __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); - __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); - __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); - __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); - __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); - __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); - __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); - __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); - __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); - __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); - - // Interleave 64-bit lates. The _0 unpack is lanes - // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes - // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes - // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes - // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. 
- __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); - __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); - __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); - __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); - __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); - __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); - __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); - __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); - __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); - __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); - __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); - __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); - __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); - __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); - __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); - __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); - - // Interleave 128-bit lanes. The _0 unpack is - // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is - // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. - __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); - __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); - __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); - __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); - __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); - __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); - __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); - __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); - __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); - __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); - __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); - __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); - __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); - __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); - __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); - __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); - - // Interleave 128-bit lanes again for the final outputs. 
- vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); - vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); - vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); - vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); - vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); - vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); - vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); - vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); - vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); - vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); - vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); - vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); - vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); - vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); - vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); - vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); -} - -INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, - size_t block_offset, __m512i out[16]) { - out[0] = loadu_512(&inputs[0][block_offset]); - out[1] = loadu_512(&inputs[1][block_offset]); - out[2] = loadu_512(&inputs[2][block_offset]); - out[3] = loadu_512(&inputs[3][block_offset]); - out[4] = loadu_512(&inputs[4][block_offset]); - out[5] = loadu_512(&inputs[5][block_offset]); - out[6] = loadu_512(&inputs[6][block_offset]); - out[7] = loadu_512(&inputs[7][block_offset]); - out[8] = loadu_512(&inputs[8][block_offset]); - out[9] = loadu_512(&inputs[9][block_offset]); - out[10] = loadu_512(&inputs[10][block_offset]); - out[11] = loadu_512(&inputs[11][block_offset]); - out[12] = loadu_512(&inputs[12][block_offset]); - out[13] = loadu_512(&inputs[13][block_offset]); - out[14] = loadu_512(&inputs[14][block_offset]); - out[15] = loadu_512(&inputs[15][block_offset]); - for (size_t i = 0; i < 16; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs_512(out); -} - -INLINE void load_counters16(uint64_t counter, bool increment_counter, - __m512i *out_lo, __m512i *out_hi) { - const __m512i mask = 
_mm512_set1_epi32(-(int32_t)increment_counter); - const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const __m512i add1 = _mm512_and_si512(mask, add0); - __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); - __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); - __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out) { - __m512i h_vecs[8] = { - set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), - set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), - }; - __m512i counter_low_vec, counter_high_vec; - load_counters16(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); - __m512i block_flags_vec = set1_512(block_flags); - __m512i msg_vecs[16]; - transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m512i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn16(v, msg_vecs, 0); - round_fn16(v, msg_vecs, 1); - round_fn16(v, msg_vecs, 2); - round_fn16(v, msg_vecs, 3); - round_fn16(v, msg_vecs, 4); - round_fn16(v, msg_vecs, 5); - round_fn16(v, msg_vecs, 6); - h_vecs[0] = xor_512(v[0], v[8]); - h_vecs[1] = xor_512(v[1], v[9]); - h_vecs[2] = xor_512(v[2], v[10]); - h_vecs[3] = xor_512(v[3], v[11]); - 
h_vecs[4] = xor_512(v[4], v[12]); - h_vecs[5] = xor_512(v[5], v[13]); - h_vecs[6] = xor_512(v[6], v[14]); - h_vecs[7] = xor_512(v[7], v[15]); - - block_flags = flags; - } - - // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 - // state vectors. Pad the matrix with zeros. After transposition, store the - // lower half of each vector. - __m512i padded[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - set1_512(0), set1_512(0), set1_512(0), set1_512(0), - }; - transpose_vecs_512(padded); - _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); - _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); - _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); - _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); - _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); - _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); - _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); - _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); - _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); - _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); - _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); - _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); - _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); - 
_mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); - _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); - _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); -} - -/* - * ---------------------------------------------------------------------------- - * hash_many_avx512 - * ---------------------------------------------------------------------------- - */ - -INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= 16) { - blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 16; - } - inputs += 16; - num_inputs -= 16; - out = &out[16 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 8) { - blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 8; - } - inputs += 8; - num_inputs -= 8; - out = &out[8 * BLAKE3_OUT_LEN]; - } - while (num_inputs >= 4) { - blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if 
(increment_counter) { - counter += 4; - } - inputs += 4; - num_inputs -= 4; - out = &out[4 * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S deleted file mode 100644 index a06aede0f..000000000 --- a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S +++ /dev/null @@ -1,2585 +0,0 @@ -#if defined(__ELF__) && defined(__linux__) -.section .note.GNU-stack,"",%progbits -#endif - -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - -.intel_syntax noprefix -.global _blake3_hash_many_avx512 -.global blake3_hash_many_avx512 -.global blake3_compress_in_place_avx512 -.global _blake3_compress_in_place_avx512 -.global blake3_compress_xof_avx512 -.global _blake3_compress_xof_avx512 - -#ifdef __APPLE__ -.text -#else -.section .text -#endif -.p2align 6 -_blake3_hash_many_avx512: -blake3_hash_many_avx512: - _CET_ENDBR - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 144 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9 - kmovw k1, r9d - vmovd xmm0, r8d - vpbroadcastd ymm0, xmm0 - shr r8, 32 - vmovd xmm1, r8d - vpbroadcastd ymm1, xmm1 - vmovdqa ymm4, ymm1 - vmovdqa ymm5, ymm1 - vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] - vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] - vpcmpltud k2, ymm2, ymm0 - vpcmpltud k3, ymm3, ymm0 - vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} - vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} - knotw k2, k1 - vmovdqa32 ymm2 {k2}, ymm0 - vmovdqa32 ymm3 {k2}, ymm0 - vmovdqa32 ymm4 {k2}, ymm1 - vmovdqa32 ymm5 {k2}, ymm1 - vmovdqa ymmword ptr [rsp], ymm2 - vmovdqa ymmword ptr 
[rsp+0x1*0x20], ymm3 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 - vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 - shl rdx, 6 - mov qword ptr [rsp+0x80], rdx - cmp rsi, 16 - jc 3f -2: - vpbroadcastd zmm0, dword ptr [rcx] - vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] - vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] - vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] - vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] - vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] - vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] - vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm8, zmm16, zmm17 - vpunpckhqdq zmm9, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm10, zmm18, zmm19 - vpunpckhqdq zmm11, zmm18, zmm19 - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - 
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm12, zmm16, zmm17 - vpunpckhqdq zmm13, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm14, zmm18, zmm19 - vpunpckhqdq zmm15, zmm18, zmm19 - vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] - vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] - vshufps zmm16, zmm8, zmm10, 136 - vshufps zmm17, zmm12, zmm14, 136 - vmovdqa32 zmm20, zmm16 - vpermt2d zmm16, zmm27, zmm17 - vpermt2d zmm20, zmm31, zmm17 - vshufps zmm17, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm21, zmm17 - vpermt2d zmm17, zmm27, zmm30 - vpermt2d zmm21, zmm31, zmm30 - vshufps zmm18, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm22, zmm18 - vpermt2d zmm18, zmm27, zmm8 - vpermt2d zmm22, zmm31, zmm8 - vshufps zmm19, zmm9, zmm11, 221 - vshufps zmm8, zmm13, zmm15, 221 - vmovdqa32 zmm23, zmm19 - vpermt2d zmm19, zmm27, zmm8 - vpermt2d zmm23, zmm31, zmm8 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm8, zmm24, zmm25 - vpunpckhqdq zmm9, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm10, zmm24, zmm25 - 
vpunpckhqdq zmm11, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm12, zmm24, zmm25 - vpunpckhqdq zmm13, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm14, zmm24, zmm25 - vpunpckhqdq zmm15, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vshufps zmm24, zmm8, zmm10, 136 - vshufps zmm30, zmm12, zmm14, 136 - vmovdqa32 zmm28, zmm24 - vpermt2d zmm24, zmm27, zmm30 - vpermt2d zmm28, zmm31, zmm30 - vshufps zmm25, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm29, zmm25 - vpermt2d zmm25, zmm27, zmm30 - vpermt2d zmm29, zmm31, zmm30 - vshufps zmm26, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm30, zmm26 - vpermt2d zmm26, zmm27, zmm8 - vpermt2d zmm30, zmm31, zmm8 - vshufps zmm8, zmm9, zmm11, 221 - vshufps zmm10, zmm13, zmm15, 221 - vpermi2d zmm27, zmm8, zmm10 - vpermi2d zmm31, zmm8, zmm10 - vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd zmm9, dword 
ptr [BLAKE3_IV_1+rip] - vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa32 zmm12, zmmword ptr [rsp] - vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] - vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm24 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - 
vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm23 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, 
zmm7, 12 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - 
vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm27 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - 
vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm21 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, 
zmm19 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord 
zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm28 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm26 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord 
zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - 
vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm22 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 
7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm31 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, 
zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpxord zmm0, zmm0, zmm8 - vpxord zmm1, zmm1, zmm9 - vpxord zmm2, zmm2, zmm10 - vpxord zmm3, zmm3, zmm11 - vpxord zmm4, zmm4, zmm12 - vpxord zmm5, zmm5, zmm13 - vpxord zmm6, zmm6, zmm14 - vpxord zmm7, zmm7, zmm15 - movzx eax, byte ptr [rbp+0x38] - jne 9b - mov rbx, qword ptr [rbp+0x50] - vpunpckldq zmm16, zmm0, zmm1 - vpunpckhdq zmm17, zmm0, zmm1 - vpunpckldq zmm18, zmm2, zmm3 - vpunpckhdq zmm19, zmm2, zmm3 - vpunpckldq zmm20, zmm4, zmm5 - vpunpckhdq zmm21, zmm4, zmm5 - vpunpckldq zmm22, zmm6, zmm7 - vpunpckhdq zmm23, zmm6, zmm7 - vpunpcklqdq zmm0, zmm16, zmm18 - vpunpckhqdq zmm1, zmm16, zmm18 - vpunpcklqdq zmm2, zmm17, zmm19 - vpunpckhqdq zmm3, zmm17, zmm19 - vpunpcklqdq zmm4, zmm20, zmm22 - vpunpckhqdq zmm5, zmm20, zmm22 - vpunpcklqdq zmm6, zmm21, zmm23 - vpunpckhqdq zmm7, zmm21, zmm23 - vshufi32x4 zmm16, zmm0, zmm4, 0x88 - vshufi32x4 zmm17, zmm1, zmm5, 0x88 - vshufi32x4 zmm18, zmm2, zmm6, 0x88 - vshufi32x4 zmm19, zmm3, zmm7, 0x88 - vshufi32x4 zmm20, zmm0, zmm4, 0xDD - 
vshufi32x4 zmm21, zmm1, zmm5, 0xDD - vshufi32x4 zmm22, zmm2, zmm6, 0xDD - vshufi32x4 zmm23, zmm3, zmm7, 0xDD - vshufi32x4 zmm0, zmm16, zmm17, 0x88 - vshufi32x4 zmm1, zmm18, zmm19, 0x88 - vshufi32x4 zmm2, zmm20, zmm21, 0x88 - vshufi32x4 zmm3, zmm22, zmm23, 0x88 - vshufi32x4 zmm4, zmm16, zmm17, 0xDD - vshufi32x4 zmm5, zmm18, zmm19, 0xDD - vshufi32x4 zmm6, zmm20, zmm21, 0xDD - vshufi32x4 zmm7, zmm22, zmm23, 0xDD - vmovdqu32 zmmword ptr [rbx], zmm0 - vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 - vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 - vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 - vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 - vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 - vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 - vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 - vmovdqa32 zmm0, zmmword ptr [rsp] - vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] - vmovdqa32 zmm2, zmm0 - vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} - vpcmpltud k2, zmm2, zmm0 - vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} - vmovdqa32 zmmword ptr [rsp], zmm2 - vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 - add rdi, 128 - add rbx, 512 - mov qword ptr [rbp+0x50], rbx - sub rsi, 16 - cmp rsi, 16 - jnc 2b - test rsi, rsi - jnz 3f -4: - vzeroupper - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 6 -3: - test esi, 0x8 - je 3f - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or eax, 
ebx - xor edx, edx -2: - movzx ebx, byte ptr [rbp+0x48] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm16, ymm12, ymm14, 136 - vshufps ymm17, ymm12, ymm14, 221 - vshufps ymm18, ymm13, ymm15, 136 - vshufps ymm19, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm20, ymm12, ymm14, 136 - vshufps ymm21, ymm12, ymm14, 221 - vshufps ymm22, ymm13, ymm15, 136 - vshufps ymm23, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - 
vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm24, ymm12, ymm14, 136 - vshufps ymm25, ymm12, ymm14, 221 - vshufps ymm26, ymm13, ymm15, 136 - vshufps ymm27, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm28, ymm12, ymm14, 136 - vshufps ymm29, ymm12, ymm14, 221 - vshufps ymm30, ymm13, ymm15, 136 - vshufps ymm31, ymm13, ymm15, 221 - vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa ymm12, ymmword ptr [rsp] - vmovdqa ymm13, ymmword ptr [rsp+0x40] - vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd ymm15, dword ptr [rsp+0x88] - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd 
ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm24 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, 
ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm23 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, 
ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, 
ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm27 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm21 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - 
vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - 
vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm28 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm26 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 
- vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 
7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd 
ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm22 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm31 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord 
ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, 
ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x38] - jne 2b - mov rbx, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp] - vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] - vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] - vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] - vmovdqa ymmword ptr [rsp], ymm0 - vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 - add rbx, 256 - mov qword ptr [rbp+0x50], rbx - add rdi, 64 - sub rsi, 8 -3: - mov rbx, qword ptr [rbp+0x50] - mov r15, qword ptr [rsp+0x80] - movzx r13, byte ptr [rbp+0x38] - movzx r12, byte ptr [rbp+0x48] - test esi, 0x4 - je 3f - vbroadcasti32x4 zmm0, xmmword 
ptr [rcx] - vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] - vmovdqa xmm12, xmmword ptr [rsp] - vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] - vpunpckldq xmm14, xmm12, xmm13 - vpunpckhdq xmm15, xmm12, xmm13 - vpermq ymm14, ymm14, 0xDC - vpermq ymm15, ymm15, 0xDC - vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] - vinserti64x4 zmm13, zmm14, ymm15, 0x01 - mov eax, 17476 - kmovw k2, eax - vpblendmd zmm13 {k2}, zmm13, zmm12 - vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov eax, 43690 - kmovw k3, eax - mov eax, 34952 - kmovw k4, eax - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vmovdqa32 zmm2, zmm15 - vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] - vpblendmd zmm3 {k4}, zmm13, zmm8 - vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x30] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 - vshufps zmm4, zmm8, zmm9, 136 - vshufps zmm5, zmm8, zmm9, 221 - vmovups zmm8, zmmword ptr [r8+rdx-0x20] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x10] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 - vshufps zmm6, zmm8, zmm9, 136 - 
vshufps zmm7, zmm8, zmm9, 221 - vpshufd zmm6, zmm6, 0x93 - vpshufd zmm7, zmm7, 0x93 - mov al, 7 -9: - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x93 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x39 - vpaddd zmm0, zmm0, zmm6 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm7 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x39 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x93 - dec al - jz 9f - vshufps zmm8, zmm4, zmm5, 214 - vpshufd zmm9, zmm4, 0x0F - vpshufd zmm4, zmm8, 0x39 - vshufps zmm8, zmm6, zmm7, 250 - vpblendmd zmm9 {k3}, zmm9, zmm8 - vpunpcklqdq zmm8, zmm7, zmm5 - vpblendmd zmm8 {k4}, zmm8, zmm6 - vpshufd zmm8, zmm8, 0x78 - vpunpckhdq zmm5, zmm5, zmm7 - vpunpckldq zmm6, zmm6, zmm5 - vpshufd zmm7, zmm6, 0x1E - vmovdqa32 zmm5, zmm9 - vmovdqa32 zmm6, zmm8 - jmp 9b -9: - vpxord zmm0, zmm0, zmm2 - vpxord zmm1, zmm1, zmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 - vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 - vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 - vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x40] - vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] - vmovdqa32 
xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x40], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test esi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x4] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x88] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - 
vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] - vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] - vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm14, dword ptr [rsp] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vpinsrd xmm3, xmm14, eax, 3 - vmovdqa xmm2, xmm15 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps 
xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b -.p2align 6 -_blake3_compress_in_place_avx512: -blake3_compress_in_place_avx512: - _CET_ENDBR - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq 
xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vmovdqu xmmword ptr [rdi], xmm0 - vmovdqu xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -_blake3_compress_xof_avx512: -blake3_compress_xof_avx512: - _CET_ENDBR - vmovdqu xmm0, xmmword ptr [rdi] - vmovdqu xmm1, xmmword ptr [rdi+0x10] - movzx eax, r8b - movzx 
edx, dl - shl rax, 32 - add rdx, rax - vmovq xmm3, rcx - vmovq xmm4, rdx - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rsi] - vmovups xmm9, xmmword ptr [rsi+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rsi+0x20] - vmovups xmm9, xmmword ptr [rsi+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vpxor xmm2, xmm2, [rdi] - vpxor xmm3, xmm3, [rdi+0x10] - vmovdqu xmmword ptr [r9], xmm0 - vmovdqu xmmword ptr [r9+0x10], xmm1 - vmovdqu xmmword ptr [r9+0x20], xmm2 - vmovdqu 
xmmword ptr [r9+0x30], xmm3 - ret - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -INDEX0: - .long 0, 1, 2, 3, 16, 17, 18, 19 - .long 8, 9, 10, 11, 24, 25, 26, 27 -INDEX1: - .long 4, 5, 6, 7, 20, 21, 22, 23 - .long 12, 13, 14, 15, 28, 29, 30, 31 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 - .long 8, 9, 10, 11, 12, 13, 14, 15 -ADD1: .long 1 - -ADD16: .long 16 -BLAKE3_BLOCK_LEN: - .long 64 -.p2align 6 -BLAKE3_IV: -BLAKE3_IV_0: - .long 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A diff --git a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S deleted file mode 100644 index e10b9f36c..000000000 --- a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S +++ /dev/null @@ -1,2615 +0,0 @@ -.intel_syntax noprefix - -.global _blake3_hash_many_avx512 -.global blake3_hash_many_avx512 -.global blake3_compress_in_place_avx512 -.global _blake3_compress_in_place_avx512 -.global blake3_compress_xof_avx512 -.global _blake3_compress_xof_avx512 - -.section .text -.p2align 6 -_blake3_hash_many_avx512: -blake3_hash_many_avx512: - push r15 - push r14 - push r13 - push r12 - push rdi - push rsi - push rbx - push rbp - mov rbp, rsp - sub rsp, 304 - and rsp, 0xFFFFFFFFFFFFFFC0 - vmovdqa xmmword ptr [rsp+0x90], xmm6 - vmovdqa xmmword ptr [rsp+0xA0], xmm7 - vmovdqa xmmword ptr [rsp+0xB0], xmm8 - vmovdqa xmmword ptr [rsp+0xC0], xmm9 - vmovdqa xmmword ptr [rsp+0xD0], xmm10 - vmovdqa xmmword ptr [rsp+0xE0], xmm11 - vmovdqa xmmword ptr [rsp+0xF0], xmm12 - vmovdqa xmmword ptr [rsp+0x100], xmm13 - vmovdqa xmmword ptr [rsp+0x110], xmm14 - vmovdqa xmmword ptr [rsp+0x120], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+0x68] - movzx r9, byte ptr [rbp+0x70] - neg r9 - kmovw k1, r9d - vmovd xmm0, r8d - vpbroadcastd ymm0, xmm0 - shr r8, 32 - vmovd xmm1, r8d - vpbroadcastd ymm1, xmm1 - vmovdqa ymm4, ymm1 - vmovdqa 
ymm5, ymm1 - vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] - vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] - vpcmpltud k2, ymm2, ymm0 - vpcmpltud k3, ymm3, ymm0 - vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} - vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} - knotw k2, k1 - vmovdqa32 ymm2 {k2}, ymm0 - vmovdqa32 ymm3 {k2}, ymm0 - vmovdqa32 ymm4 {k2}, ymm1 - vmovdqa32 ymm5 {k2}, ymm1 - vmovdqa ymmword ptr [rsp], ymm2 - vmovdqa ymmword ptr [rsp+0x20], ymm3 - vmovdqa ymmword ptr [rsp+0x40], ymm4 - vmovdqa ymmword ptr [rsp+0x60], ymm5 - shl rdx, 6 - mov qword ptr [rsp+0x80], rdx - cmp rsi, 16 - jc 3f -2: - vpbroadcastd zmm0, dword ptr [rcx] - vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] - vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] - vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] - vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] - vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] - vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] - vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] - movzx eax, byte ptr [rbp+0x78] - movzx ebx, byte ptr [rbp+0x80] - or eax, ebx - xor edx, edx -.p2align 5 -9: - movzx ebx, byte ptr [rbp+0x88] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm8, zmm16, zmm17 - vpunpckhqdq zmm9, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - 
vpunpcklqdq zmm10, zmm18, zmm19 - vpunpckhqdq zmm11, zmm18, zmm19 - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 - vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 - vpunpcklqdq zmm12, zmm16, zmm17 - vpunpckhqdq zmm13, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 - vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 - vpunpcklqdq zmm14, zmm18, zmm19 - vpunpckhqdq zmm15, zmm18, zmm19 - vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] - vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] - vshufps zmm16, zmm8, zmm10, 136 - vshufps zmm17, zmm12, zmm14, 136 - vmovdqa32 zmm20, zmm16 - vpermt2d zmm16, zmm27, zmm17 - vpermt2d zmm20, zmm31, zmm17 - vshufps zmm17, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm21, zmm17 - vpermt2d zmm17, zmm27, zmm30 - vpermt2d zmm21, zmm31, zmm30 - vshufps zmm18, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm22, zmm18 - vpermt2d zmm18, zmm27, zmm8 - vpermt2d zmm22, zmm31, zmm8 - vshufps zmm19, zmm9, zmm11, 221 - vshufps zmm8, zmm13, zmm15, 221 - vmovdqa32 zmm23, zmm19 - vpermt2d zmm19, zmm27, zmm8 - vpermt2d zmm23, zmm31, zmm8 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x40] - mov r13, qword ptr [rdi+0x48] - mov r14, qword ptr [rdi+0x50] - mov r15, qword ptr [rdi+0x58] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 
ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm8, zmm24, zmm25 - vpunpckhqdq zmm9, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm10, zmm24, zmm25 - vpunpckhqdq zmm11, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - mov r8, qword ptr [rdi+0x20] - mov r9, qword ptr [rdi+0x28] - mov r10, qword ptr [rdi+0x30] - mov r11, qword ptr [rdi+0x38] - mov r12, qword ptr [rdi+0x60] - mov r13, qword ptr [rdi+0x68] - mov r14, qword ptr [rdi+0x70] - mov r15, qword ptr [rdi+0x78] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm12, zmm24, zmm25 - vpunpckhqdq zmm13, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 - vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 - vpunpcklqdq zmm14, zmm24, zmm25 - vpunpckhqdq zmm15, zmm24, zmm25 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r12+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r13+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r14+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - prefetcht0 [r15+rdx+0x80] - vshufps zmm24, zmm8, zmm10, 136 - vshufps zmm30, zmm12, zmm14, 136 - vmovdqa32 zmm28, zmm24 - vpermt2d zmm24, zmm27, zmm30 - vpermt2d zmm28, zmm31, zmm30 - vshufps zmm25, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - 
vmovdqa32 zmm29, zmm25 - vpermt2d zmm25, zmm27, zmm30 - vpermt2d zmm29, zmm31, zmm30 - vshufps zmm26, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm30, zmm26 - vpermt2d zmm26, zmm27, zmm8 - vpermt2d zmm30, zmm31, zmm8 - vshufps zmm8, zmm9, zmm11, 221 - vshufps zmm10, zmm13, zmm15, 221 - vpermi2d zmm27, zmm8, zmm10 - vpermi2d zmm31, zmm8, zmm10 - vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa32 zmm12, zmmword ptr [rsp] - vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] - vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - 
vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm24 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm23 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - 
vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, 
zmm30 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord 
zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm27 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm21 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord 
zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - 
vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm28 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm26 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 
7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, 
zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm22 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, 
zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm31 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, 
zmm1, zmm18 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpxord zmm0, zmm0, zmm8 - vpxord zmm1, zmm1, zmm9 - vpxord zmm2, zmm2, zmm10 - vpxord zmm3, zmm3, zmm11 - vpxord zmm4, zmm4, zmm12 - vpxord zmm5, zmm5, zmm13 - vpxord zmm6, zmm6, zmm14 - vpxord zmm7, zmm7, zmm15 - movzx eax, byte ptr [rbp+0x78] - jne 9b - mov rbx, qword ptr [rbp+0x90] - vpunpckldq zmm16, zmm0, zmm1 - vpunpckhdq zmm17, zmm0, zmm1 - vpunpckldq zmm18, zmm2, zmm3 - vpunpckhdq zmm19, zmm2, zmm3 - vpunpckldq zmm20, zmm4, zmm5 - vpunpckhdq zmm21, zmm4, zmm5 - vpunpckldq zmm22, zmm6, zmm7 - vpunpckhdq zmm23, zmm6, zmm7 - vpunpcklqdq zmm0, zmm16, 
zmm18 - vpunpckhqdq zmm1, zmm16, zmm18 - vpunpcklqdq zmm2, zmm17, zmm19 - vpunpckhqdq zmm3, zmm17, zmm19 - vpunpcklqdq zmm4, zmm20, zmm22 - vpunpckhqdq zmm5, zmm20, zmm22 - vpunpcklqdq zmm6, zmm21, zmm23 - vpunpckhqdq zmm7, zmm21, zmm23 - vshufi32x4 zmm16, zmm0, zmm4, 0x88 - vshufi32x4 zmm17, zmm1, zmm5, 0x88 - vshufi32x4 zmm18, zmm2, zmm6, 0x88 - vshufi32x4 zmm19, zmm3, zmm7, 0x88 - vshufi32x4 zmm20, zmm0, zmm4, 0xDD - vshufi32x4 zmm21, zmm1, zmm5, 0xDD - vshufi32x4 zmm22, zmm2, zmm6, 0xDD - vshufi32x4 zmm23, zmm3, zmm7, 0xDD - vshufi32x4 zmm0, zmm16, zmm17, 0x88 - vshufi32x4 zmm1, zmm18, zmm19, 0x88 - vshufi32x4 zmm2, zmm20, zmm21, 0x88 - vshufi32x4 zmm3, zmm22, zmm23, 0x88 - vshufi32x4 zmm4, zmm16, zmm17, 0xDD - vshufi32x4 zmm5, zmm18, zmm19, 0xDD - vshufi32x4 zmm6, zmm20, zmm21, 0xDD - vshufi32x4 zmm7, zmm22, zmm23, 0xDD - vmovdqu32 zmmword ptr [rbx], zmm0 - vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 - vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 - vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 - vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 - vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 - vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 - vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 - vmovdqa32 zmm0, zmmword ptr [rsp] - vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] - vmovdqa32 zmm2, zmm0 - vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} - vpcmpltud k2, zmm2, zmm0 - vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} - vmovdqa32 zmmword ptr [rsp], zmm2 - vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 - add rdi, 128 - add rbx, 512 - mov qword ptr [rbp+0x90], rbx - sub rsi, 16 - cmp rsi, 16 - jnc 2b - test rsi, rsi - jne 3f -4: - vzeroupper - vmovdqa xmm6, xmmword ptr [rsp+0x90] - vmovdqa xmm7, xmmword ptr [rsp+0xA0] - vmovdqa xmm8, xmmword ptr [rsp+0xB0] - vmovdqa xmm9, xmmword ptr [rsp+0xC0] - vmovdqa xmm10, xmmword ptr [rsp+0xD0] - vmovdqa xmm11, xmmword ptr [rsp+0xE0] - vmovdqa xmm12, xmmword ptr [rsp+0xF0] - vmovdqa xmm13, xmmword ptr [rsp+0x100] - vmovdqa xmm14, xmmword ptr 
[rsp+0x110] - vmovdqa xmm15, xmmword ptr [rsp+0x120] - mov rsp, rbp - pop rbp - pop rbx - pop rsi - pop rdi - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 6 -3: - test esi, 0x8 - je 3f - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+0x4] - vpbroadcastd ymm2, dword ptr [rcx+0x8] - vpbroadcastd ymm3, dword ptr [rcx+0xC] - vpbroadcastd ymm4, dword ptr [rcx+0x10] - vpbroadcastd ymm5, dword ptr [rcx+0x14] - vpbroadcastd ymm6, dword ptr [rcx+0x18] - vpbroadcastd ymm7, dword ptr [rcx+0x1C] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov r12, qword ptr [rdi+0x20] - mov r13, qword ptr [rdi+0x28] - mov r14, qword ptr [rdi+0x30] - mov r15, qword ptr [rdi+0x38] - movzx eax, byte ptr [rbp+0x78] - movzx ebx, byte ptr [rbp+0x80] - or eax, ebx - xor edx, edx -2: - movzx ebx, byte ptr [rbp+0x88] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+0x80] - cmove eax, ebx - mov dword ptr [rsp+0x88], eax - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x40] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x40] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x40] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm16, ymm12, ymm14, 136 - vshufps ymm17, ymm12, ymm14, 221 - vshufps ymm18, ymm13, ymm15, 136 - vshufps ymm19, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x30] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x30] - 
vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x30] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm20, ymm12, ymm14, 136 - vshufps ymm21, ymm12, ymm14, 221 - vshufps ymm22, ymm13, ymm15, 136 - vshufps ymm23, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x20] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x20] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x20] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm24, ymm12, ymm14, 136 - vshufps ymm25, ymm12, ymm14, 221 - vshufps ymm26, ymm13, ymm15, 136 - vshufps ymm27, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x10] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 - vmovups xmm9, xmmword ptr [r9+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-0x10] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 - vmovups xmm11, xmmword ptr [r11+rdx-0x10] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm28, ymm12, ymm14, 136 - vshufps ymm29, ymm12, ymm14, 221 - vshufps ymm30, ymm13, ymm15, 136 - vshufps ymm31, ymm13, ymm15, 221 - vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] - vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] - vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] - vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] - vmovdqa ymm12, ymmword ptr [rsp] - vmovdqa ymm13, ymmword ptr [rsp+0x40] - 
vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] - vpbroadcastd ymm15, dword ptr [rsp+0x88] - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm24 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 
- vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm23 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, 
ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, 
ymm4, 7 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm27 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - 
vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm21 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 
- vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm28 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm31 - 
vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm26 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord 
ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord 
ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm22 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd 
ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm31 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 
12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+0x78] - jne 2b - mov rbx, qword ptr [rbp+0x90] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0xCC - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [rbx+0x20], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0xCC - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [rbx+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword 
ptr [rbx+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [rbx+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [rbx+0xA0], ymm11 - vmovups ymmword ptr [rbx+0xC0], ymm14 - vmovups ymmword ptr [rbx+0xE0], ymm15 - vmovdqa ymm0, ymmword ptr [rsp] - vmovdqa ymm2, ymmword ptr [rsp+0x40] - vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] - vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] - vmovdqa ymmword ptr [rsp], ymm0 - vmovdqa ymmword ptr [rsp+0x40], ymm2 - add rbx, 256 - mov qword ptr [rbp+0x90], rbx - add rdi, 64 - sub rsi, 8 -3: - mov rbx, qword ptr [rbp+0x90] - mov r15, qword ptr [rsp+0x80] - movzx r13, byte ptr [rbp+0x78] - movzx r12, byte ptr [rbp+0x88] - test esi, 0x4 - je 3f - vbroadcasti32x4 zmm0, xmmword ptr [rcx] - vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] - vmovdqa xmm12, xmmword ptr [rsp] - vmovdqa xmm13, xmmword ptr [rsp+0x40] - vpunpckldq xmm14, xmm12, xmm13 - vpunpckhdq xmm15, xmm12, xmm13 - vpermq ymm14, ymm14, 0xDC - vpermq ymm15, ymm15, 0xDC - vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] - vinserti64x4 zmm13, zmm14, ymm15, 0x01 - mov eax, 17476 - kmovw k2, eax - vpblendmd zmm13 {k2}, zmm13, zmm12 - vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - mov eax, 43690 - kmovw k3, eax - mov eax, 34952 - kmovw k4, eax - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vmovdqa32 zmm2, zmm15 - vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] - vpblendmd zmm3 {k4}, zmm13, zmm8 - vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 - vinserti32x4 zmm8, zmm8, 
xmmword ptr [r11+rdx-0x4*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x30] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 - vshufps zmm4, zmm8, zmm9, 136 - vshufps zmm5, zmm8, zmm9, 221 - vmovups zmm8, zmmword ptr [r8+rdx-0x20] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 - vmovups zmm9, zmmword ptr [r8+rdx-0x10] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 - vshufps zmm6, zmm8, zmm9, 136 - vshufps zmm7, zmm8, zmm9, 221 - vpshufd zmm6, zmm6, 0x93 - vpshufd zmm7, zmm7, 0x93 - mov al, 7 -9: - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x93 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x39 - vpaddd zmm0, zmm0, zmm6 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm7 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 0x39 - vpshufd zmm3, zmm3, 0x4E - vpshufd zmm2, zmm2, 0x93 - dec al - jz 9f - vshufps zmm8, zmm4, zmm5, 214 - vpshufd zmm9, zmm4, 0x0F - vpshufd zmm4, zmm8, 0x39 - vshufps zmm8, zmm6, zmm7, 250 - vpblendmd zmm9 {k3}, zmm9, zmm8 - vpunpcklqdq zmm8, zmm7, 
zmm5 - vpblendmd zmm8 {k4}, zmm8, zmm6 - vpshufd zmm8, zmm8, 0x78 - vpunpckhdq zmm5, zmm5, zmm7 - vpunpckldq zmm6, zmm6, zmm5 - vpshufd zmm7, zmm6, 0x1E - vmovdqa32 zmm5, zmm9 - vmovdqa32 zmm6, zmm8 - jmp 9b -9: - vpxord zmm0, zmm0, zmm2 - vpxord zmm1, zmm1, zmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 - vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 - vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 - vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+0x40] - vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] - vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x40], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -3: - test esi, 0x2 - je 3f - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] - vmovd xmm13, dword ptr [rsp] - vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 - vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovd xmm14, dword ptr [rsp+0x4] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vinserti128 ymm13, ymm13, xmm14, 0x01 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+0x88], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] - vpbroadcastd ymm8, dword ptr [rsp+0x88] - vpblendd ymm3, ymm13, ymm8, 0x88 - vmovups ymm8, ymmword ptr [r8+rdx-0x40] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x30] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 - 
vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-0x20] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 - vmovups ymm9, ymmword ptr [r8+rdx-0x10] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 0x93 - vpshufd ymm7, ymm7, 0x93 - mov al, 7 -9: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x93 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x39 - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 0x39 - vpshufd ymm3, ymm3, 0x4E - vpshufd ymm2, ymm2, 0x93 - dec al - jz 9f - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0x0F - vpshufd ymm4, ymm8, 0x39 - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0xAA - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 0x88 - vpshufd ymm8, ymm8, 0x78 - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 0x1E - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp 9b -9: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 - vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr 
[rsp+0x40] - vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] - vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+0x40], xmm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovd xmm14, dword ptr [rsp] - vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -.p2align 5 -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vpinsrd xmm3, xmm14, eax, 3 - vmovdqa xmm2, xmm15 - vmovups xmm8, xmmword ptr [r8+rdx-0x40] - vmovups xmm9, xmmword ptr [r8+rdx-0x30] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-0x20] - vmovups xmm9, xmmword ptr [r8+rdx-0x10] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps 
xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+0x10], xmm1 - jmp 4b - - -.p2align 6 -_blake3_compress_in_place_avx512: -blake3_compress_in_place_avx512: - sub rsp, 72 - vmovdqa xmmword ptr [rsp], xmm6 - vmovdqa xmmword ptr [rsp+0x10], xmm7 - vmovdqa xmmword ptr [rsp+0x20], xmm8 - vmovdqa xmmword ptr [rsp+0x30], xmm9 - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - movzx eax, byte ptr [rsp+0x70] - movzx r8d, r8b - shl rax, 32 - add r8, rax - vmovq xmm3, r9 - vmovq xmm4, r8 - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rdx] - vmovups xmm9, xmmword ptr [rdx+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rdx+0x20] - vmovups xmm9, xmmword ptr [rdx+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - 
vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vmovdqu xmmword ptr [rcx], xmm0 - vmovdqu xmmword ptr [rcx+0x10], xmm1 - vmovdqa xmm6, xmmword ptr [rsp] - vmovdqa xmm7, xmmword ptr [rsp+0x10] - vmovdqa xmm8, xmmword ptr [rsp+0x20] - vmovdqa xmm9, xmmword ptr [rsp+0x30] - add rsp, 72 - ret - - -.p2align 6 -_blake3_compress_xof_avx512: -blake3_compress_xof_avx512: - sub rsp, 72 - vmovdqa xmmword ptr [rsp], xmm6 - vmovdqa xmmword ptr [rsp+0x10], xmm7 - vmovdqa xmmword ptr [rsp+0x20], xmm8 - vmovdqa xmmword ptr [rsp+0x30], xmm9 - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - movzx eax, byte ptr [rsp+0x70] - movzx r8d, r8b - mov r10, qword ptr [rsp+0x78] - shl rax, 32 - add r8, rax - vmovq xmm3, r9 - vmovq xmm4, r8 - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] - vmovups xmm8, xmmword ptr [rdx] - vmovups xmm9, xmmword ptr [rdx+0x10] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rdx+0x20] - vmovups xmm9, xmmword ptr [rdx+0x30] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 0x93 - vpshufd xmm7, xmm7, 0x93 - mov al, 7 -9: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, 
xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec al - jz 9f - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0x0F - vpshufd xmm4, xmm8, 0x39 - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0xAA - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 0x88 - vpshufd xmm8, xmm8, 0x78 - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 0x1E - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp 9b -9: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vpxor xmm2, xmm2, xmmword ptr [rcx] - vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] - vmovdqu xmmword ptr [r10], xmm0 - vmovdqu xmmword ptr [r10+0x10], xmm1 - vmovdqu xmmword ptr [r10+0x20], xmm2 - vmovdqu xmmword ptr [r10+0x30], xmm3 - vmovdqa xmm6, xmmword ptr [rsp] - vmovdqa xmm7, xmmword ptr [rsp+0x10] - vmovdqa xmm8, xmmword ptr [rsp+0x20] - vmovdqa xmm9, xmmword ptr [rsp+0x30] - add rsp, 72 - ret - -.section .rodata -.p2align 6 -INDEX0: - .long 0, 1, 2, 3, 16, 17, 18, 19 - .long 8, 9, 10, 11, 24, 25, 26, 27 -INDEX1: - .long 4, 5, 6, 7, 20, 21, 22, 23 - .long 12, 13, 14, 15, 28, 29, 30, 31 -ADD0: - .long 0, 1, 2, 3, 4, 5, 6, 7 - .long 8, 9, 10, 11, 12, 13, 14, 15 -ADD1: .long 1 - -ADD16: .long 16 -BLAKE3_BLOCK_LEN: - .long 64 -.p2align 6 -BLAKE3_IV: -BLAKE3_IV_0: - .long 0x6A09E667 -BLAKE3_IV_1: - .long 
0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A diff --git a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm deleted file mode 100644 index b19efbaae..000000000 --- a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm +++ /dev/null @@ -1,2634 +0,0 @@ -public _blake3_hash_many_avx512 -public blake3_hash_many_avx512 -public blake3_compress_in_place_avx512 -public _blake3_compress_in_place_avx512 -public blake3_compress_xof_avx512 -public _blake3_compress_xof_avx512 - -_TEXT SEGMENT ALIGN(16) 'CODE' - -ALIGN 16 -blake3_hash_many_avx512 PROC -_blake3_hash_many_avx512 PROC - push r15 - push r14 - push r13 - push r12 - push rdi - push rsi - push rbx - push rbp - mov rbp, rsp - sub rsp, 304 - and rsp, 0FFFFFFFFFFFFFFC0H - vmovdqa xmmword ptr [rsp+90H], xmm6 - vmovdqa xmmword ptr [rsp+0A0H], xmm7 - vmovdqa xmmword ptr [rsp+0B0H], xmm8 - vmovdqa xmmword ptr [rsp+0C0H], xmm9 - vmovdqa xmmword ptr [rsp+0D0H], xmm10 - vmovdqa xmmword ptr [rsp+0E0H], xmm11 - vmovdqa xmmword ptr [rsp+0F0H], xmm12 - vmovdqa xmmword ptr [rsp+100H], xmm13 - vmovdqa xmmword ptr [rsp+110H], xmm14 - vmovdqa xmmword ptr [rsp+120H], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+68H] - movzx r9, byte ptr [rbp+70H] - neg r9 - kmovw k1, r9d - vmovd xmm0, r8d - vpbroadcastd ymm0, xmm0 - shr r8, 32 - vmovd xmm1, r8d - vpbroadcastd ymm1, xmm1 - vmovdqa ymm4, ymm1 - vmovdqa ymm5, ymm1 - vpaddd ymm2, ymm0, ymmword ptr [ADD0] - vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] - vpcmpud k2, ymm2, ymm0, 1 - vpcmpud k3, ymm3, ymm0, 1 - ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
- vpbroadcastd ymm6, dword ptr [ADD1] - vpaddd ymm4 {k2}, ymm4, ymm6 - vpaddd ymm5 {k3}, ymm5, ymm6 - ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} - ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} - knotw k2, k1 - vmovdqa32 ymm2 {k2}, ymm0 - vmovdqa32 ymm3 {k2}, ymm0 - vmovdqa32 ymm4 {k2}, ymm1 - vmovdqa32 ymm5 {k2}, ymm1 - vmovdqa ymmword ptr [rsp], ymm2 - vmovdqa ymmword ptr [rsp+20H], ymm3 - vmovdqa ymmword ptr [rsp+40H], ymm4 - vmovdqa ymmword ptr [rsp+60H], ymm5 - shl rdx, 6 - mov qword ptr [rsp+80H], rdx - cmp rsi, 16 - jc final15blocks -outerloop16: - vpbroadcastd zmm0, dword ptr [rcx] - vpbroadcastd zmm1, dword ptr [rcx+1H*4H] - vpbroadcastd zmm2, dword ptr [rcx+2H*4H] - vpbroadcastd zmm3, dword ptr [rcx+3H*4H] - vpbroadcastd zmm4, dword ptr [rcx+4H*4H] - vpbroadcastd zmm5, dword ptr [rcx+5H*4H] - vpbroadcastd zmm6, dword ptr [rcx+6H*4H] - vpbroadcastd zmm7, dword ptr [rcx+7H*4H] - movzx eax, byte ptr [rbp+78H] - movzx ebx, byte ptr [rbp+80H] - or eax, ebx - xor edx, edx -ALIGN 16 -innerloop16: - movzx ebx, byte ptr [rbp+88H] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+80H] - cmove eax, ebx - mov dword ptr [rsp+88H], eax - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr [rdi+18H] - mov r12, qword ptr [rdi+40H] - mov r13, qword ptr [rdi+48H] - mov r14, qword ptr [rdi+50H] - mov r15, qword ptr [rdi+58H] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H - vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H - vpunpcklqdq zmm8, zmm16, zmm17 - vpunpckhqdq zmm9, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H - vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H - vpunpcklqdq zmm10, zmm18, zmm19 - vpunpckhqdq zmm11, zmm18, zmm19 - mov r8, qword 
ptr [rdi+20H] - mov r9, qword ptr [rdi+28H] - mov r10, qword ptr [rdi+30H] - mov r11, qword ptr [rdi+38H] - mov r12, qword ptr [rdi+60H] - mov r13, qword ptr [rdi+68H] - mov r14, qword ptr [rdi+70H] - mov r15, qword ptr [rdi+78H] - vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] - vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H - vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] - vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H - vpunpcklqdq zmm12, zmm16, zmm17 - vpunpckhqdq zmm13, zmm16, zmm17 - vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] - vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H - vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] - vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H - vpunpcklqdq zmm14, zmm18, zmm19 - vpunpckhqdq zmm15, zmm18, zmm19 - vmovdqa32 zmm27, zmmword ptr [INDEX0] - vmovdqa32 zmm31, zmmword ptr [INDEX1] - vshufps zmm16, zmm8, zmm10, 136 - vshufps zmm17, zmm12, zmm14, 136 - vmovdqa32 zmm20, zmm16 - vpermt2d zmm16, zmm27, zmm17 - vpermt2d zmm20, zmm31, zmm17 - vshufps zmm17, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm21, zmm17 - vpermt2d zmm17, zmm27, zmm30 - vpermt2d zmm21, zmm31, zmm30 - vshufps zmm18, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm22, zmm18 - vpermt2d zmm18, zmm27, zmm8 - vpermt2d zmm22, zmm31, zmm8 - vshufps zmm19, zmm9, zmm11, 221 - vshufps zmm8, zmm13, zmm15, 221 - vmovdqa32 zmm23, zmm19 - vpermt2d zmm19, zmm27, zmm8 - vpermt2d zmm23, zmm31, zmm8 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr [rdi+18H] - mov r12, qword ptr [rdi+40H] - mov r13, qword ptr [rdi+48H] - mov r14, qword ptr [rdi+50H] - mov r15, qword ptr [rdi+58H] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H - vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H - vpunpcklqdq zmm8, zmm24, zmm25 
- vpunpckhqdq zmm9, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H - vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H - vpunpcklqdq zmm10, zmm24, zmm25 - vpunpckhqdq zmm11, zmm24, zmm25 - prefetcht0 byte ptr [r8+rdx+80H] - prefetcht0 byte ptr [r12+rdx+80H] - prefetcht0 byte ptr [r9+rdx+80H] - prefetcht0 byte ptr [r13+rdx+80H] - prefetcht0 byte ptr [r10+rdx+80H] - prefetcht0 byte ptr [r14+rdx+80H] - prefetcht0 byte ptr [r11+rdx+80H] - prefetcht0 byte ptr [r15+rdx+80H] - mov r8, qword ptr [rdi+20H] - mov r9, qword ptr [rdi+28H] - mov r10, qword ptr [rdi+30H] - mov r11, qword ptr [rdi+38H] - mov r12, qword ptr [rdi+60H] - mov r13, qword ptr [rdi+68H] - mov r14, qword ptr [rdi+70H] - mov r15, qword ptr [rdi+78H] - vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] - vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H - vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] - vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H - vpunpcklqdq zmm12, zmm24, zmm25 - vpunpckhqdq zmm13, zmm24, zmm25 - vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] - vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H - vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] - vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H - vpunpcklqdq zmm14, zmm24, zmm25 - vpunpckhqdq zmm15, zmm24, zmm25 - prefetcht0 byte ptr [r8+rdx+80H] - prefetcht0 byte ptr [r12+rdx+80H] - prefetcht0 byte ptr [r9+rdx+80H] - prefetcht0 byte ptr [r13+rdx+80H] - prefetcht0 byte ptr [r10+rdx+80H] - prefetcht0 byte ptr [r14+rdx+80H] - prefetcht0 byte ptr [r11+rdx+80H] - prefetcht0 byte ptr [r15+rdx+80H] - vshufps zmm24, zmm8, zmm10, 136 - vshufps zmm30, zmm12, zmm14, 136 - vmovdqa32 zmm28, zmm24 - vpermt2d zmm24, zmm27, zmm30 - vpermt2d zmm28, zmm31, zmm30 - vshufps zmm25, zmm8, zmm10, 221 - vshufps zmm30, zmm12, zmm14, 221 - vmovdqa32 zmm29, zmm25 - vpermt2d zmm25, zmm27, zmm30 
- vpermt2d zmm29, zmm31, zmm30 - vshufps zmm26, zmm9, zmm11, 136 - vshufps zmm8, zmm13, zmm15, 136 - vmovdqa32 zmm30, zmm26 - vpermt2d zmm26, zmm27, zmm8 - vpermt2d zmm30, zmm31, zmm8 - vshufps zmm8, zmm9, zmm11, 221 - vshufps zmm10, zmm13, zmm15, 221 - vpermi2d zmm27, zmm8, zmm10 - vpermi2d zmm31, zmm8, zmm10 - vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0] - vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] - vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] - vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] - vmovdqa32 zmm12, zmmword ptr [rsp] - vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] - vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] - vpbroadcastd zmm15, dword ptr [rsp+22H*4H] - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, 
zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm24 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm23 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 
16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm17 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, 
zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm29 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord 
zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm22 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm27 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm21 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm30 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd 
zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm23 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm20 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm21 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm16 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - 
vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm28 - vpaddd zmm1, zmm1, zmm25 - vpaddd zmm2, zmm2, zmm31 - vpaddd zmm3, zmm3, zmm30 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm26 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm23 - 
vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm16 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm18 - vpaddd zmm1, zmm1, zmm19 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm25 - vpaddd zmm1, zmm1, zmm27 - vpaddd zmm2, zmm2, zmm24 - vpaddd zmm3, zmm3, zmm31 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord 
zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm28 - vpaddd zmm3, zmm3, zmm17 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm29 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm18 - vpaddd zmm3, zmm3, zmm20 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm19 - vpaddd zmm1, zmm1, zmm26 - vpaddd zmm2, zmm2, zmm22 - vpaddd zmm3, zmm3, zmm23 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - 
vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpaddd zmm0, zmm0, zmm27 - vpaddd zmm1, zmm1, zmm21 - vpaddd zmm2, zmm2, zmm17 - vpaddd zmm3, zmm3, zmm24 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vprord zmm15, zmm15, 16 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 12 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vpaddd zmm0, zmm0, zmm31 - vpaddd zmm1, zmm1, zmm16 - vpaddd zmm2, zmm2, zmm25 - vpaddd zmm3, zmm3, zmm22 - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm1, zmm1, zmm5 - vpaddd zmm2, zmm2, zmm6 - vpaddd zmm3, zmm3, zmm7 - vpxord zmm12, zmm12, zmm0 - vpxord zmm13, zmm13, zmm1 - vpxord zmm14, zmm14, zmm2 - vpxord zmm15, zmm15, zmm3 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vprord zmm15, zmm15, 8 - vpaddd zmm8, zmm8, zmm12 - vpaddd zmm9, zmm9, zmm13 - vpaddd zmm10, zmm10, zmm14 - vpaddd zmm11, zmm11, zmm15 - vpxord zmm4, zmm4, zmm8 - vpxord zmm5, zmm5, zmm9 - vpxord zmm6, zmm6, zmm10 - vpxord zmm7, zmm7, zmm11 - vprord zmm4, zmm4, 7 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vpaddd zmm0, zmm0, zmm30 - vpaddd zmm1, zmm1, zmm18 - vpaddd zmm2, zmm2, zmm19 - vpaddd zmm3, zmm3, zmm23 - 
vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 16 - vprord zmm12, zmm12, 16 - vprord zmm13, zmm13, 16 - vprord zmm14, zmm14, 16 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 12 - vprord zmm6, zmm6, 12 - vprord zmm7, zmm7, 12 - vprord zmm4, zmm4, 12 - vpaddd zmm0, zmm0, zmm26 - vpaddd zmm1, zmm1, zmm28 - vpaddd zmm2, zmm2, zmm20 - vpaddd zmm3, zmm3, zmm29 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm1, zmm1, zmm6 - vpaddd zmm2, zmm2, zmm7 - vpaddd zmm3, zmm3, zmm4 - vpxord zmm15, zmm15, zmm0 - vpxord zmm12, zmm12, zmm1 - vpxord zmm13, zmm13, zmm2 - vpxord zmm14, zmm14, zmm3 - vprord zmm15, zmm15, 8 - vprord zmm12, zmm12, 8 - vprord zmm13, zmm13, 8 - vprord zmm14, zmm14, 8 - vpaddd zmm10, zmm10, zmm15 - vpaddd zmm11, zmm11, zmm12 - vpaddd zmm8, zmm8, zmm13 - vpaddd zmm9, zmm9, zmm14 - vpxord zmm5, zmm5, zmm10 - vpxord zmm6, zmm6, zmm11 - vpxord zmm7, zmm7, zmm8 - vpxord zmm4, zmm4, zmm9 - vprord zmm5, zmm5, 7 - vprord zmm6, zmm6, 7 - vprord zmm7, zmm7, 7 - vprord zmm4, zmm4, 7 - vpxord zmm0, zmm0, zmm8 - vpxord zmm1, zmm1, zmm9 - vpxord zmm2, zmm2, zmm10 - vpxord zmm3, zmm3, zmm11 - vpxord zmm4, zmm4, zmm12 - vpxord zmm5, zmm5, zmm13 - vpxord zmm6, zmm6, zmm14 - vpxord zmm7, zmm7, zmm15 - movzx eax, byte ptr [rbp+78H] - jne innerloop16 - mov rbx, qword ptr [rbp+90H] - vpunpckldq zmm16, zmm0, zmm1 - vpunpckhdq zmm17, zmm0, zmm1 - vpunpckldq zmm18, zmm2, zmm3 - vpunpckhdq zmm19, zmm2, zmm3 - vpunpckldq zmm20, zmm4, zmm5 - vpunpckhdq zmm21, zmm4, zmm5 - vpunpckldq zmm22, zmm6, zmm7 - vpunpckhdq zmm23, zmm6, zmm7 - vpunpcklqdq zmm0, zmm16, zmm18 - vpunpckhqdq zmm1, zmm16, zmm18 - vpunpcklqdq zmm2, 
zmm17, zmm19 - vpunpckhqdq zmm3, zmm17, zmm19 - vpunpcklqdq zmm4, zmm20, zmm22 - vpunpckhqdq zmm5, zmm20, zmm22 - vpunpcklqdq zmm6, zmm21, zmm23 - vpunpckhqdq zmm7, zmm21, zmm23 - vshufi32x4 zmm16, zmm0, zmm4, 88H - vshufi32x4 zmm17, zmm1, zmm5, 88H - vshufi32x4 zmm18, zmm2, zmm6, 88H - vshufi32x4 zmm19, zmm3, zmm7, 88H - vshufi32x4 zmm20, zmm0, zmm4, 0DDH - vshufi32x4 zmm21, zmm1, zmm5, 0DDH - vshufi32x4 zmm22, zmm2, zmm6, 0DDH - vshufi32x4 zmm23, zmm3, zmm7, 0DDH - vshufi32x4 zmm0, zmm16, zmm17, 88H - vshufi32x4 zmm1, zmm18, zmm19, 88H - vshufi32x4 zmm2, zmm20, zmm21, 88H - vshufi32x4 zmm3, zmm22, zmm23, 88H - vshufi32x4 zmm4, zmm16, zmm17, 0DDH - vshufi32x4 zmm5, zmm18, zmm19, 0DDH - vshufi32x4 zmm6, zmm20, zmm21, 0DDH - vshufi32x4 zmm7, zmm22, zmm23, 0DDH - vmovdqu32 zmmword ptr [rbx], zmm0 - vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 - vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 - vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 - vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 - vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 - vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 - vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 - vmovdqa32 zmm0, zmmword ptr [rsp] - vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] - vmovdqa32 zmm2, zmm0 - ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
- vpbroadcastd zmm4, dword ptr [ADD16] - vpbroadcastd zmm5, dword ptr [ADD1] - vpaddd zmm2{k1}, zmm0, zmm4 - ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} - vpcmpud k2, zmm2, zmm0, 1 - vpaddd zmm1 {k2}, zmm1, zmm5 - ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} - vmovdqa32 zmmword ptr [rsp], zmm2 - vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 - add rdi, 128 - add rbx, 512 - mov qword ptr [rbp+90H], rbx - sub rsi, 16 - cmp rsi, 16 - jnc outerloop16 - test rsi, rsi - jne final15blocks -unwind: - vzeroupper - vmovdqa xmm6, xmmword ptr [rsp+90H] - vmovdqa xmm7, xmmword ptr [rsp+0A0H] - vmovdqa xmm8, xmmword ptr [rsp+0B0H] - vmovdqa xmm9, xmmword ptr [rsp+0C0H] - vmovdqa xmm10, xmmword ptr [rsp+0D0H] - vmovdqa xmm11, xmmword ptr [rsp+0E0H] - vmovdqa xmm12, xmmword ptr [rsp+0F0H] - vmovdqa xmm13, xmmword ptr [rsp+100H] - vmovdqa xmm14, xmmword ptr [rsp+110H] - vmovdqa xmm15, xmmword ptr [rsp+120H] - mov rsp, rbp - pop rbp - pop rbx - pop rsi - pop rdi - pop r12 - pop r13 - pop r14 - pop r15 - ret -ALIGN 16 -final15blocks: - test esi, 8H - je final7blocks - vpbroadcastd ymm0, dword ptr [rcx] - vpbroadcastd ymm1, dword ptr [rcx+4H] - vpbroadcastd ymm2, dword ptr [rcx+8H] - vpbroadcastd ymm3, dword ptr [rcx+0CH] - vpbroadcastd ymm4, dword ptr [rcx+10H] - vpbroadcastd ymm5, dword ptr [rcx+14H] - vpbroadcastd ymm6, dword ptr [rcx+18H] - vpbroadcastd ymm7, dword ptr [rcx+1CH] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr [rdi+18H] - mov r12, qword ptr [rdi+20H] - mov r13, qword ptr [rdi+28H] - mov r14, qword ptr [rdi+30H] - mov r15, qword ptr [rdi+38H] - movzx eax, byte ptr [rbp+78H] - movzx ebx, byte ptr [rbp+80H] - or eax, ebx - xor edx, edx -innerloop8: - movzx ebx, byte ptr [rbp+88H] - or ebx, eax - add rdx, 64 - cmp rdx, qword ptr [rsp+80H] - cmove eax, ebx - mov dword ptr [rsp+88H], eax - vmovups xmm8, xmmword ptr [r8+rdx-40H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H - vmovups xmm9, 
xmmword ptr [r9+rdx-40H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-40H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-40H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm16, ymm12, ymm14, 136 - vshufps ymm17, ymm12, ymm14, 221 - vshufps ymm18, ymm13, ymm15, 136 - vshufps ymm19, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-30H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H - vmovups xmm9, xmmword ptr [r9+rdx-30H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-30H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-30H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm20, ymm12, ymm14, 136 - vshufps ymm21, ymm12, ymm14, 221 - vshufps ymm22, ymm13, ymm15, 136 - vshufps ymm23, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-20H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H - vmovups xmm9, xmmword ptr [r9+rdx-20H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-20H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-20H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm24, ymm12, ymm14, 136 - vshufps ymm25, ymm12, ymm14, 221 - vshufps ymm26, ymm13, ymm15, 136 - vshufps ymm27, ymm13, ymm15, 221 - vmovups xmm8, xmmword ptr [r8+rdx-10H] - vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H - vmovups xmm9, xmmword 
ptr [r9+rdx-10H] - vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H - vunpcklpd ymm12, ymm8, ymm9 - vunpckhpd ymm13, ymm8, ymm9 - vmovups xmm10, xmmword ptr [r10+rdx-10H] - vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H - vmovups xmm11, xmmword ptr [r11+rdx-10H] - vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H - vunpcklpd ymm14, ymm10, ymm11 - vunpckhpd ymm15, ymm10, ymm11 - vshufps ymm28, ymm12, ymm14, 136 - vshufps ymm29, ymm12, ymm14, 221 - vshufps ymm30, ymm13, ymm15, 136 - vshufps ymm31, ymm13, ymm15, 221 - vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] - vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] - vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] - vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] - vmovdqa ymm12, ymmword ptr [rsp] - vmovdqa ymm13, ymmword ptr [rsp+40H] - vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] - vpbroadcastd ymm15, dword ptr [rsp+88H] - vpaddd ymm0, ymm0, ymm16 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, 
ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm24 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm23 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, 
ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm17 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord 
ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm29 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd 
ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm22 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm27 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm21 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm30 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord 
ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm20 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm21 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm16 - 
vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm28 - vpaddd ymm1, ymm1, ymm25 - vpaddd ymm2, ymm2, ymm31 - vpaddd ymm3, ymm3, ymm30 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm26 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, 
ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm23 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm16 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm18 - vpaddd ymm1, ymm1, ymm19 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm25 - vpaddd ymm1, ymm1, ymm27 - vpaddd ymm2, ymm2, ymm24 - vpaddd ymm3, ymm3, ymm31 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, 
ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm28 - vpaddd ymm3, ymm3, ymm17 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm29 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm18 - vpaddd ymm3, ymm3, ymm20 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm19 - vpaddd ymm1, ymm1, ymm26 - vpaddd ymm2, ymm2, ymm22 - vpaddd ymm3, ymm3, ymm23 - vpaddd 
ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpaddd ymm0, ymm0, ymm27 - vpaddd ymm1, ymm1, ymm21 - vpaddd ymm2, ymm2, ymm17 - vpaddd ymm3, ymm3, ymm24 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vprord ymm15, ymm15, 16 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, ymm11 - vprord ymm4, ymm4, 12 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vpaddd ymm0, ymm0, ymm31 - vpaddd ymm1, ymm1, ymm16 - vpaddd ymm2, ymm2, ymm25 - vpaddd ymm3, ymm3, ymm22 - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm1, ymm1, ymm5 - vpaddd ymm2, ymm2, ymm6 - vpaddd ymm3, ymm3, ymm7 - vpxord ymm12, ymm12, ymm0 - vpxord ymm13, ymm13, ymm1 - vpxord ymm14, ymm14, ymm2 - vpxord ymm15, ymm15, ymm3 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vprord ymm15, ymm15, 8 - vpaddd ymm8, ymm8, ymm12 - vpaddd ymm9, ymm9, ymm13 - vpaddd ymm10, ymm10, ymm14 - vpaddd ymm11, ymm11, ymm15 - vpxord ymm4, ymm4, ymm8 - vpxord ymm5, ymm5, ymm9 - vpxord ymm6, ymm6, ymm10 - vpxord ymm7, ymm7, 
ymm11 - vprord ymm4, ymm4, 7 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vpaddd ymm0, ymm0, ymm30 - vpaddd ymm1, ymm1, ymm18 - vpaddd ymm2, ymm2, ymm19 - vpaddd ymm3, ymm3, ymm23 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 16 - vprord ymm12, ymm12, 16 - vprord ymm13, ymm13, 16 - vprord ymm14, ymm14, 16 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 12 - vprord ymm6, ymm6, 12 - vprord ymm7, ymm7, 12 - vprord ymm4, ymm4, 12 - vpaddd ymm0, ymm0, ymm26 - vpaddd ymm1, ymm1, ymm28 - vpaddd ymm2, ymm2, ymm20 - vpaddd ymm3, ymm3, ymm29 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm1, ymm1, ymm6 - vpaddd ymm2, ymm2, ymm7 - vpaddd ymm3, ymm3, ymm4 - vpxord ymm15, ymm15, ymm0 - vpxord ymm12, ymm12, ymm1 - vpxord ymm13, ymm13, ymm2 - vpxord ymm14, ymm14, ymm3 - vprord ymm15, ymm15, 8 - vprord ymm12, ymm12, 8 - vprord ymm13, ymm13, 8 - vprord ymm14, ymm14, 8 - vpaddd ymm10, ymm10, ymm15 - vpaddd ymm11, ymm11, ymm12 - vpaddd ymm8, ymm8, ymm13 - vpaddd ymm9, ymm9, ymm14 - vpxord ymm5, ymm5, ymm10 - vpxord ymm6, ymm6, ymm11 - vpxord ymm7, ymm7, ymm8 - vpxord ymm4, ymm4, ymm9 - vprord ymm5, ymm5, 7 - vprord ymm6, ymm6, 7 - vprord ymm7, ymm7, 7 - vprord ymm4, ymm4, 7 - vpxor ymm0, ymm0, ymm8 - vpxor ymm1, ymm1, ymm9 - vpxor ymm2, ymm2, ymm10 - vpxor ymm3, ymm3, ymm11 - vpxor ymm4, ymm4, ymm12 - vpxor ymm5, ymm5, ymm13 - vpxor ymm6, ymm6, ymm14 - vpxor ymm7, ymm7, ymm15 - movzx eax, byte ptr [rbp+78H] - jne innerloop8 - mov rbx, qword ptr [rbp+90H] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, 
ymm6, ymm7 - vshufps ymm12, ymm8, ymm9, 78 - vblendps ymm1, ymm8, ymm12, 0CCH - vshufps ymm8, ymm11, ymm0, 78 - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0CCH - vblendps ymm3, ymm12, ymm9, 0CCH - vperm2f128 ymm12, ymm1, ymm2, 20H - vmovups ymmword ptr [rbx], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0CCH - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 20H - vmovups ymmword ptr [rbx+20H], ymm7 - vshufps ymm5, ymm10, ymm13, 78 - vblendps ymm6, ymm5, ymm13, 0CCH - vshufps ymm13, ymm14, ymm15, 78 - vblendps ymm10, ymm10, ymm5, 0CCH - vblendps ymm14, ymm14, ymm13, 0CCH - vperm2f128 ymm8, ymm10, ymm14, 20H - vmovups ymmword ptr [rbx+40H], ymm8 - vblendps ymm15, ymm13, ymm15, 0CCH - vperm2f128 ymm13, ymm6, ymm15, 20H - vmovups ymmword ptr [rbx+60H], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 31H - vperm2f128 ymm11, ymm3, ymm4, 31H - vmovups ymmword ptr [rbx+80H], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 31H - vperm2f128 ymm15, ymm6, ymm15, 31H - vmovups ymmword ptr [rbx+0A0H], ymm11 - vmovups ymmword ptr [rbx+0C0H], ymm14 - vmovups ymmword ptr [rbx+0E0H], ymm15 - vmovdqa ymm0, ymmword ptr [rsp] - vmovdqa ymm2, ymmword ptr [rsp+40H] - vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] - vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] - vmovdqa ymmword ptr [rsp], ymm0 - vmovdqa ymmword ptr [rsp+40H], ymm2 - add rbx, 256 - mov qword ptr [rbp+90H], rbx - add rdi, 64 - sub rsi, 8 -final7blocks: - mov rbx, qword ptr [rbp+90H] - mov r15, qword ptr [rsp+80H] - movzx r13, byte ptr [rbp+78H] - movzx r12, byte ptr [rbp+88H] - test esi, 4H - je final3blocks - vbroadcasti32x4 zmm0, xmmword ptr [rcx] - vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] - vmovdqa xmm12, xmmword ptr [rsp] - vmovdqa xmm13, xmmword ptr [rsp+40H] - vpunpckldq xmm14, xmm12, xmm13 - vpunpckhdq xmm15, xmm12, xmm13 - vpermq ymm14, ymm14, 0DCH - vpermq ymm15, ymm15, 0DCH - vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] - vinserti64x4 zmm13, zmm14, ymm15, 01H - mov eax, 17476 - 
kmovw k2, eax - vpblendmd zmm13 {k2}, zmm13, zmm12 - vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr [rdi+18H] - mov eax, 43690 - kmovw k3, eax - mov eax, 34952 - kmovw k4, eax - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -ALIGN 16 -innerloop4: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+88H], eax - vmovdqa32 zmm2, zmm15 - vpbroadcastd zmm8, dword ptr [rsp+22H*4H] - vpblendmd zmm3 {k4}, zmm13, zmm8 - vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H - vmovups zmm9, zmmword ptr [r8+rdx-30H] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H - vshufps zmm4, zmm8, zmm9, 136 - vshufps zmm5, zmm8, zmm9, 221 - vmovups zmm8, zmmword ptr [r8+rdx-20H] - vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H - vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H - vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H - vmovups zmm9, zmmword ptr [r8+rdx-10H] - vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H - vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H - vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H - vshufps zmm6, zmm8, zmm9, 136 - vshufps zmm7, zmm8, zmm9, 221 - vpshufd zmm6, zmm6, 93H - vpshufd zmm7, zmm7, 93H - mov al, 7 -roundloop4: - vpaddd zmm0, zmm0, zmm4 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm5 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord 
zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 93H - vpshufd zmm3, zmm3, 4EH - vpshufd zmm2, zmm2, 39H - vpaddd zmm0, zmm0, zmm6 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 16 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 12 - vpaddd zmm0, zmm0, zmm7 - vpaddd zmm0, zmm0, zmm1 - vpxord zmm3, zmm3, zmm0 - vprord zmm3, zmm3, 8 - vpaddd zmm2, zmm2, zmm3 - vpxord zmm1, zmm1, zmm2 - vprord zmm1, zmm1, 7 - vpshufd zmm0, zmm0, 39H - vpshufd zmm3, zmm3, 4EH - vpshufd zmm2, zmm2, 93H - dec al - jz endroundloop4 - vshufps zmm8, zmm4, zmm5, 214 - vpshufd zmm9, zmm4, 0FH - vpshufd zmm4, zmm8, 39H - vshufps zmm8, zmm6, zmm7, 250 - vpblendmd zmm9 {k3}, zmm9, zmm8 - vpunpcklqdq zmm8, zmm7, zmm5 - vpblendmd zmm8 {k4}, zmm8, zmm6 - vpshufd zmm8, zmm8, 78H - vpunpckhdq zmm5, zmm5, zmm7 - vpunpckldq zmm6, zmm6, zmm5 - vpshufd zmm7, zmm6, 1EH - vmovdqa32 zmm5, zmm9 - vmovdqa32 zmm6, zmm8 - jmp roundloop4 -endroundloop4: - vpxord zmm0, zmm0, zmm2 - vpxord zmm1, zmm1, zmm3 - mov eax, r13d - cmp rdx, r15 - jne innerloop4 - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+10H], xmm1 - vextracti128 xmmword ptr [rbx+20H], ymm0, 01H - vextracti128 xmmword ptr [rbx+30H], ymm1, 01H - vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H - vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H - vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H - vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+40H] - vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] - vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+40H], xmm2 - add rbx, 128 - add rdi, 32 - sub rsi, 4 -final3blocks: - test esi, 2H - je final1block - vbroadcasti128 ymm0, xmmword ptr [rcx] - vbroadcasti128 ymm1, xmmword ptr [rcx+10H] - vmovd xmm13, dword ptr [rsp] - vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 - vpinsrd xmm13, xmm13, dword ptr 
[BLAKE3_BLOCK_LEN], 2 - vmovd xmm14, dword ptr [rsp+4H] - vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 - vinserti128 ymm13, ymm13, xmm14, 01H - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -ALIGN 16 -innerloop2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - mov dword ptr [rsp+88H], eax - vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] - vpbroadcastd ymm8, dword ptr [rsp+88H] - vpblendd ymm3, ymm13, ymm8, 88H - vmovups ymm8, ymmword ptr [r8+rdx-40H] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H - vmovups ymm9, ymmword ptr [r8+rdx-30H] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H - vshufps ymm4, ymm8, ymm9, 136 - vshufps ymm5, ymm8, ymm9, 221 - vmovups ymm8, ymmword ptr [r8+rdx-20H] - vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H - vmovups ymm9, ymmword ptr [r8+rdx-10H] - vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H - vshufps ymm6, ymm8, ymm9, 136 - vshufps ymm7, ymm8, ymm9, 221 - vpshufd ymm6, ymm6, 93H - vpshufd ymm7, ymm7, 93H - mov al, 7 -roundloop2: - vpaddd ymm0, ymm0, ymm4 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm5 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 93H - vpshufd ymm3, ymm3, 4EH - vpshufd ymm2, ymm2, 39H - vpaddd ymm0, ymm0, ymm6 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 16 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 12 - vpaddd ymm0, ymm0, ymm7 - vpaddd ymm0, ymm0, ymm1 - vpxord ymm3, ymm3, ymm0 - vprord ymm3, ymm3, 8 - vpaddd ymm2, ymm2, ymm3 - vpxord ymm1, ymm1, ymm2 - vprord ymm1, ymm1, 7 - vpshufd ymm0, ymm0, 39H - vpshufd ymm3, ymm3, 
4EH - vpshufd ymm2, ymm2, 93H - dec al - jz endroundloop2 - vshufps ymm8, ymm4, ymm5, 214 - vpshufd ymm9, ymm4, 0FH - vpshufd ymm4, ymm8, 39H - vshufps ymm8, ymm6, ymm7, 250 - vpblendd ymm9, ymm9, ymm8, 0AAH - vpunpcklqdq ymm8, ymm7, ymm5 - vpblendd ymm8, ymm8, ymm6, 88H - vpshufd ymm8, ymm8, 78H - vpunpckhdq ymm5, ymm5, ymm7 - vpunpckldq ymm6, ymm6, ymm5 - vpshufd ymm7, ymm6, 1EH - vmovdqa ymm5, ymm9 - vmovdqa ymm6, ymm8 - jmp roundloop2 -endroundloop2: - vpxor ymm0, ymm0, ymm2 - vpxor ymm1, ymm1, ymm3 - mov eax, r13d - cmp rdx, r15 - jne innerloop2 - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+10H], xmm1 - vextracti128 xmmword ptr [rbx+20H], ymm0, 01H - vextracti128 xmmword ptr [rbx+30H], ymm1, 01H - vmovdqa xmm0, xmmword ptr [rsp] - vmovdqa xmm2, xmmword ptr [rsp+40H] - vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] - vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] - vmovdqa xmmword ptr [rsp], xmm0 - vmovdqa xmmword ptr [rsp+40H], xmm2 - add rbx, 64 - add rdi, 16 - sub rsi, 2 -final1block: - test esi, 1H - je unwind - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+10H] - vmovd xmm14, dword ptr [rsp] - vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 - vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 - vmovdqa xmm15, xmmword ptr [BLAKE3_IV] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -ALIGN 16 -innerloop1: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - vpinsrd xmm3, xmm14, eax, 3 - vmovdqa xmm2, xmm15 - vmovups xmm8, xmmword ptr [r8+rdx-40H] - vmovups xmm9, xmmword ptr [r8+rdx-30H] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [r8+rdx-20H] - vmovups xmm9, xmmword ptr [r8+rdx-10H] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 93H - vpshufd xmm7, xmm7, 93H - mov al, 7 -roundloop1: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 
- vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 93H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 39H - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 39H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 93H - dec al - jz endroundloop1 - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0FH - vpshufd xmm4, xmm8, 39H - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0AAH - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 88H - vpshufd xmm8, xmm8, 78H - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 1EH - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp roundloop1 -endroundloop1: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne innerloop1 - vmovdqu xmmword ptr [rbx], xmm0 - vmovdqu xmmword ptr [rbx+10H], xmm1 - jmp unwind - -_blake3_hash_many_avx512 ENDP -blake3_hash_many_avx512 ENDP - -ALIGN 16 -blake3_compress_in_place_avx512 PROC -_blake3_compress_in_place_avx512 PROC - sub rsp, 72 - vmovdqa xmmword ptr [rsp], xmm6 - vmovdqa xmmword ptr [rsp+10H], xmm7 - vmovdqa xmmword ptr [rsp+20H], xmm8 - vmovdqa xmmword ptr [rsp+30H], xmm9 - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+10H] - movzx eax, byte ptr [rsp+70H] - movzx r8d, r8b - shl rax, 32 - add r8, rax - vmovq xmm3, r9 - vmovq xmm4, r8 - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV] - vmovups xmm8, xmmword ptr [rdx] - vmovups xmm9, xmmword 
ptr [rdx+10H] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rdx+20H] - vmovups xmm9, xmmword ptr [rdx+30H] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 93H - vpshufd xmm7, xmm7, 93H - mov al, 7 -@@: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 93H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 39H - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 39H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 93H - dec al - jz @F - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0FH - vpshufd xmm4, xmm8, 39H - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0AAH - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 88H - vpshufd xmm8, xmm8, 78H - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 1EH - vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp @B -@@: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vmovdqu xmmword ptr [rcx], xmm0 - vmovdqu xmmword ptr [rcx+10H], xmm1 - vmovdqa xmm6, xmmword ptr [rsp] - vmovdqa xmm7, xmmword ptr [rsp+10H] - vmovdqa xmm8, xmmword ptr [rsp+20H] - vmovdqa xmm9, xmmword ptr [rsp+30H] - add rsp, 72 - ret -_blake3_compress_in_place_avx512 ENDP -blake3_compress_in_place_avx512 ENDP - -ALIGN 16 -blake3_compress_xof_avx512 PROC -_blake3_compress_xof_avx512 PROC - 
sub rsp, 72 - vmovdqa xmmword ptr [rsp], xmm6 - vmovdqa xmmword ptr [rsp+10H], xmm7 - vmovdqa xmmword ptr [rsp+20H], xmm8 - vmovdqa xmmword ptr [rsp+30H], xmm9 - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+10H] - movzx eax, byte ptr [rsp+70H] - movzx r8d, r8b - mov r10, qword ptr [rsp+78H] - shl rax, 32 - add r8, rax - vmovq xmm3, r9 - vmovq xmm4, r8 - vpunpcklqdq xmm3, xmm3, xmm4 - vmovaps xmm2, xmmword ptr [BLAKE3_IV] - vmovups xmm8, xmmword ptr [rdx] - vmovups xmm9, xmmword ptr [rdx+10H] - vshufps xmm4, xmm8, xmm9, 136 - vshufps xmm5, xmm8, xmm9, 221 - vmovups xmm8, xmmword ptr [rdx+20H] - vmovups xmm9, xmmword ptr [rdx+30H] - vshufps xmm6, xmm8, xmm9, 136 - vshufps xmm7, xmm8, xmm9, 221 - vpshufd xmm6, xmm6, 93H - vpshufd xmm7, xmm7, 93H - mov al, 7 -@@: - vpaddd xmm0, xmm0, xmm4 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm5 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 93H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 39H - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 16 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 12 - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 8 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 7 - vpshufd xmm0, xmm0, 39H - vpshufd xmm3, xmm3, 4EH - vpshufd xmm2, xmm2, 93H - dec al - jz @F - vshufps xmm8, xmm4, xmm5, 214 - vpshufd xmm9, xmm4, 0FH - vpshufd xmm4, xmm8, 39H - vshufps xmm8, xmm6, xmm7, 250 - vpblendd xmm9, xmm9, xmm8, 0AAH - vpunpcklqdq xmm8, xmm7, xmm5 - vpblendd xmm8, xmm8, xmm6, 88H - vpshufd xmm8, xmm8, 78H - vpunpckhdq xmm5, xmm5, xmm7 - vpunpckldq xmm6, xmm6, xmm5 - vpshufd xmm7, xmm6, 1EH - 
vmovdqa xmm5, xmm9 - vmovdqa xmm6, xmm8 - jmp @B -@@: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - vpxor xmm2, xmm2, xmmword ptr [rcx] - vpxor xmm3, xmm3, xmmword ptr [rcx+10H] - vmovdqu xmmword ptr [r10], xmm0 - vmovdqu xmmword ptr [r10+10H], xmm1 - vmovdqu xmmword ptr [r10+20H], xmm2 - vmovdqu xmmword ptr [r10+30H], xmm3 - vmovdqa xmm6, xmmword ptr [rsp] - vmovdqa xmm7, xmmword ptr [rsp+10H] - vmovdqa xmm8, xmmword ptr [rsp+20H] - vmovdqa xmm9, xmmword ptr [rsp+30H] - add rsp, 72 - ret -_blake3_compress_xof_avx512 ENDP -blake3_compress_xof_avx512 ENDP - -_TEXT ENDS - -_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' -ALIGN 64 -INDEX0: - dd 0, 1, 2, 3, 16, 17, 18, 19 - dd 8, 9, 10, 11, 24, 25, 26, 27 -INDEX1: - dd 4, 5, 6, 7, 20, 21, 22, 23 - dd 12, 13, 14, 15, 28, 29, 30, 31 -ADD0: - dd 0, 1, 2, 3, 4, 5, 6, 7 - dd 8, 9, 10, 11, 12, 13, 14, 15 -ADD1: - dd 1 -ADD16: - dd 16 -BLAKE3_BLOCK_LEN: - dd 64 -ALIGN 64 -BLAKE3_IV: -BLAKE3_IV_0: - dd 06A09E667H -BLAKE3_IV_1: - dd 0BB67AE85H -BLAKE3_IV_2: - dd 03C6EF372H -BLAKE3_IV_3: - dd 0A54FF53AH - -_RDATA ENDS -END diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml deleted file mode 100644 index 2052c7458..000000000 --- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -# These are Rust bindings for the C implementation of BLAKE3. As there is a -# native (and faster) Rust implementation of BLAKE3 provided in this same repo, -# these bindings are not expected to be used in production. They're intended -# for testing and benchmarking. - -[package] -name = "blake3_c_rust_bindings" -version = "0.0.0" -description = "TESTING ONLY Rust bindings for the BLAKE3 C implementation" -edition = "2018" - -[features] -# By default the x86-64 build uses assembly implementations. This feature makes -# the build use the C intrinsics implementations instead. -prefer_intrinsics = [] -# Activate NEON bindings. 
We don't currently do any CPU feature detection for -# this. If this Cargo feature is on, the NEON gets used. -neon = [] - -[dev-dependencies] -arrayref = "0.3.5" -arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] } -page_size = "0.4.1" -rand = "0.7.2" -rand_chacha = "0.2.1" -reference_impl = { path = "../../reference_impl" } - -[build-dependencies] -cc = "1.0.48" diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/README.md b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/README.md deleted file mode 100644 index c44726b90..000000000 --- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/README.md +++ /dev/null @@ -1,4 +0,0 @@ -These are Rust bindings for the C implementation of BLAKE3. As there is -a native Rust implementation of BLAKE3 provided in this same repo, these -bindings are not expected to be used in production. They're intended for -testing and benchmarking. diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs deleted file mode 100644 index 119bd2064..000000000 --- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs +++ /dev/null @@ -1,393 +0,0 @@ -#![feature(test)] - -extern crate test; - -use arrayref::array_ref; -use arrayvec::ArrayVec; -use rand::prelude::*; -use test::Bencher; - -const KIB: usize = 1024; -const MAX_SIMD_DEGREE: usize = 16; - -const BLOCK_LEN: usize = 64; -const CHUNK_LEN: usize = 1024; -const OUT_LEN: usize = 32; - -// This struct randomizes two things: -// 1. The actual bytes of input. -// 2. The page offset the input starts at. 
-pub struct RandomInput { - buf: Vec<u8>, - len: usize, - offsets: Vec<usize>, - offset_index: usize, -} - -impl RandomInput { - pub fn new(b: &mut Bencher, len: usize) -> Self { - b.bytes += len as u64; - let page_size: usize = page_size::get(); - let mut buf = vec![0u8; len + page_size]; - let mut rng = rand::thread_rng(); - rng.fill_bytes(&mut buf); - let mut offsets: Vec<usize> = (0..page_size).collect(); - offsets.shuffle(&mut rng); - Self { - buf, - len, - offsets, - offset_index: 0, - } - } - - pub fn get(&mut self) -> &[u8] { - let offset = self.offsets[self.offset_index]; - self.offset_index += 1; - if self.offset_index >= self.offsets.len() { - self.offset_index = 0; - } - &self.buf[offset..][..self.len] - } -} - -type CompressInPlaceFn = - unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); - -fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) { - let mut state = [1u32; 8]; - let mut r = RandomInput::new(b, 64); - let input = array_ref!(r.get(), 0, 64); - b.iter(|| unsafe { f(state.as_mut_ptr(), input.as_ptr(), 64, 0, 0) }); -} - -#[bench] -fn bench_single_compression_portable(b: &mut Bencher) { - bench_single_compression_fn( - b, - blake3_c_rust_bindings::ffi::blake3_compress_in_place_portable, - ); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_single_compression_sse2(b: &mut Bencher) { - if !blake3_c_rust_bindings::sse2_detected() { - return; - } - bench_single_compression_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse2, - ); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_single_compression_sse41(b: &mut Bencher) { - if !blake3_c_rust_bindings::sse41_detected() { - return; - } - bench_single_compression_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse41, - ); -} - -#[bench] -fn bench_single_compression_avx512(b: &mut Bencher) { - if 
!blake3_c_rust_bindings::avx512_detected() { - return; - } - bench_single_compression_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_avx512, - ); -} - -type HashManyFn = unsafe extern "C" fn( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, -); - -fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { - let mut inputs = Vec::new(); - for _ in 0..degree { - inputs.push(RandomInput::new(b, CHUNK_LEN)); - } - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - unsafe { - f( - input_arrays.as_ptr() as _, - input_arrays.len(), - CHUNK_LEN / BLOCK_LEN, - [0u32; 8].as_ptr(), - 0, - true, - 0, - 0, - 0, - out.as_mut_ptr(), - ) - } - }); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_sse2(b: &mut Bencher) { - if !blake3_c_rust_bindings::sse2_detected() { - return; - } - bench_many_chunks_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, - 4, - ); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_sse41(b: &mut Bencher) { - if !blake3_c_rust_bindings::sse41_detected() { - return; - } - bench_many_chunks_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, - 4, - ); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_avx2(b: &mut Bencher) { - if !blake3_c_rust_bindings::avx2_detected() { - return; - } - bench_many_chunks_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, - 8, - ); -} - -#[bench] -fn bench_many_chunks_avx512(b: &mut Bencher) { - if !blake3_c_rust_bindings::avx512_detected() { - return; - } - bench_many_chunks_fn( - b, - 
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, - 16, - ); -} - -#[bench] -#[cfg(feature = "neon")] -fn bench_many_chunks_neon(b: &mut Bencher) { - // When "neon" is on, NEON support is assumed. - bench_many_chunks_fn( - b, - blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, - 4, - ); -} - -// TODO: When we get const generics we can unify this with the chunks code. -fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { - let mut inputs = Vec::new(); - for _ in 0..degree { - inputs.push(RandomInput::new(b, BLOCK_LEN)); - } - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - unsafe { - f( - input_arrays.as_ptr() as _, - input_arrays.len(), - 1, - [0u32; 8].as_ptr(), - 0, - false, - 0, - 0, - 0, - out.as_mut_ptr(), - ) - } - }); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_sse2(b: &mut Bencher) { - if !blake3_c_rust_bindings::sse2_detected() { - return; - } - bench_many_parents_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, - 4, - ); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_sse41(b: &mut Bencher) { - if !blake3_c_rust_bindings::sse41_detected() { - return; - } - bench_many_parents_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, - 4, - ); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_avx2(b: &mut Bencher) { - if !blake3_c_rust_bindings::avx2_detected() { - return; - } - bench_many_parents_fn( - b, - blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, - 8, - ); -} - -#[bench] -fn bench_many_parents_avx512(b: &mut Bencher) { - if !blake3_c_rust_bindings::avx512_detected() { - return; - } - bench_many_parents_fn( - b, - 
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, - 16, - ); -} - -#[bench] -#[cfg(feature = "neon")] -fn bench_many_parents_neon(b: &mut Bencher) { - // When "neon" is on, NEON support is assumed. - bench_many_parents_fn( - b, - blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, - 4, - ); -} - -fn bench_incremental(b: &mut Bencher, len: usize) { - let mut input = RandomInput::new(b, len); - b.iter(|| { - let mut hasher = blake3_c_rust_bindings::Hasher::new(); - hasher.update(input.get()); - let mut out = [0; 32]; - hasher.finalize(&mut out); - out - }); -} - -#[bench] -fn bench_incremental_0001_block(b: &mut Bencher) { - bench_incremental(b, BLOCK_LEN); -} - -#[bench] -fn bench_incremental_0001_kib(b: &mut Bencher) { - bench_incremental(b, 1 * KIB); -} - -#[bench] -fn bench_incremental_0002_kib(b: &mut Bencher) { - bench_incremental(b, 2 * KIB); -} - -#[bench] -fn bench_incremental_0004_kib(b: &mut Bencher) { - bench_incremental(b, 4 * KIB); -} - -#[bench] -fn bench_incremental_0008_kib(b: &mut Bencher) { - bench_incremental(b, 8 * KIB); -} - -#[bench] -fn bench_incremental_0016_kib(b: &mut Bencher) { - bench_incremental(b, 16 * KIB); -} - -#[bench] -fn bench_incremental_0032_kib(b: &mut Bencher) { - bench_incremental(b, 32 * KIB); -} - -#[bench] -fn bench_incremental_0064_kib(b: &mut Bencher) { - bench_incremental(b, 64 * KIB); -} - -#[bench] -fn bench_incremental_0128_kib(b: &mut Bencher) { - bench_incremental(b, 128 * KIB); -} - -#[bench] -fn bench_incremental_0256_kib(b: &mut Bencher) { - bench_incremental(b, 256 * KIB); -} - -#[bench] -fn bench_incremental_0512_kib(b: &mut Bencher) { - bench_incremental(b, 512 * KIB); -} - -#[bench] -fn bench_incremental_1024_kib(b: &mut Bencher) { - bench_incremental(b, 1024 * KIB); -} - -// This checks that update() splits up its input in increasing powers of 2, so -// that it can recover a high degree of parallelism when the number of bytes -// hashed so far is uneven. 
The performance of this benchmark should be -// reasonably close to bench_incremental_0064_kib, within 80% or so. When we -// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), -// performance was less than half. -#[bench] -fn bench_two_updates(b: &mut Bencher) { - let len = 65536; - let mut input = RandomInput::new(b, len); - b.iter(|| { - let mut hasher = blake3_c_rust_bindings::Hasher::new(); - let input = input.get(); - hasher.update(&input[..1]); - hasher.update(&input[1..]); - let mut out = [0; 32]; - hasher.finalize(&mut out); - out - }); -} diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs deleted file mode 100644 index d5dc47a81..000000000 --- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs +++ /dev/null @@ -1,182 +0,0 @@ -use std::env; - -fn defined(var: &str) -> bool { - env::var_os(var).is_some() -} - -fn target_components() -> Vec<String> { - let target = env::var("TARGET").unwrap(); - target.split("-").map(|s| s.to_string()).collect() -} - -fn is_x86_64() -> bool { - target_components()[0] == "x86_64" -} - -fn is_x86_32() -> bool { - let arch = &target_components()[0]; - arch == "i386" || arch == "i586" || arch == "i686" -} - -fn is_armv7() -> bool { - target_components()[0] == "armv7" -} - -// Windows targets may be using the MSVC toolchain or the GNU toolchain. The -// right compiler flags to use depend on the toolchain. (And we don't want to -// use flag_if_supported, because we don't want features to be silently -// disabled by old compilers.) -fn is_windows_msvc() -> bool { - // Some targets are only two components long, so check in steps. - target_components()[1] == "pc" - && target_components()[2] == "windows" - && target_components()[3] == "msvc" -} - -fn is_windows_gnu() -> bool { - // Some targets are only two components long, so check in steps. 
- target_components()[1] == "pc" - && target_components()[2] == "windows" - && target_components()[3] == "gnu" -} - -fn new_build() -> cc::Build { - let mut build = cc::Build::new(); - if !is_windows_msvc() { - build.flag("-std=c11"); - } - build -} - -fn c_dir_path(filename: &str) -> String { - // The `cross` tool doesn't support reading files in parent directories. As a hacky workaround - // in `cross_test.sh`, we move the c/ directory around and set BLAKE3_C_DIR_OVERRIDE. Regular - // building and testing doesn't require this. - if let Ok(c_dir_override) = env::var("BLAKE3_C_DIR_OVERRIDE") { - c_dir_override + "/" + filename - } else { - "../".to_string() + filename - } -} - -fn main() -> Result<(), Box<dyn std::error::Error>> { - let mut base_build = new_build(); - base_build.file(c_dir_path("blake3.c")); - base_build.file(c_dir_path("blake3_dispatch.c")); - base_build.file(c_dir_path("blake3_portable.c")); - base_build.compile("blake3_base"); - - if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") { - // On 64-bit, use the assembly implementations, unless the - // "prefer_intrinsics" feature is enabled. - if is_windows_msvc() { - let mut build = new_build(); - build.file(c_dir_path("blake3_sse2_x86-64_windows_msvc.asm")); - build.file(c_dir_path("blake3_sse41_x86-64_windows_msvc.asm")); - build.file(c_dir_path("blake3_avx2_x86-64_windows_msvc.asm")); - build.file(c_dir_path("blake3_avx512_x86-64_windows_msvc.asm")); - build.compile("blake3_asm"); - } else if is_windows_gnu() { - let mut build = new_build(); - build.file(c_dir_path("blake3_sse2_x86-64_windows_gnu.S")); - build.file(c_dir_path("blake3_sse41_x86-64_windows_gnu.S")); - build.file(c_dir_path("blake3_avx2_x86-64_windows_gnu.S")); - build.file(c_dir_path("blake3_avx512_x86-64_windows_gnu.S")); - build.compile("blake3_asm"); - } else { - // All non-Windows implementations are assumed to support - // Linux-style assembly. These files do contain a small - // explicit workaround for macOS also. 
- let mut build = new_build(); - build.file(c_dir_path("blake3_sse2_x86-64_unix.S")); - build.file(c_dir_path("blake3_sse41_x86-64_unix.S")); - build.file(c_dir_path("blake3_avx2_x86-64_unix.S")); - build.file(c_dir_path("blake3_avx512_x86-64_unix.S")); - build.compile("blake3_asm"); - } - } else if is_x86_64() || is_x86_32() { - // Assembly implementations are only for 64-bit. On 32-bit, or if - // the "prefer_intrinsics" feature is enabled, use the - // intrinsics-based C implementations. These each need to be - // compiled separately, with the corresponding instruction set - // extension explicitly enabled in the compiler. - - let mut sse2_build = new_build(); - sse2_build.file(c_dir_path("blake3_sse2.c")); - if is_windows_msvc() { - // /arch:SSE2 is the default on x86 and undefined on x86_64: - // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 - // It also includes SSE4.1 intrisincs: - // https://stackoverflow.com/a/32183222/823869 - } else { - sse2_build.flag("-msse2"); - } - sse2_build.compile("blake3_sse2"); - - let mut sse41_build = new_build(); - sse41_build.file(c_dir_path("blake3_sse41.c")); - if is_windows_msvc() { - // /arch:SSE2 is the default on x86 and undefined on x86_64: - // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 - // It also includes SSE4.1 intrisincs: - // https://stackoverflow.com/a/32183222/823869 - } else { - sse41_build.flag("-msse4.1"); - } - sse41_build.compile("blake3_sse41"); - - let mut avx2_build = new_build(); - avx2_build.file(c_dir_path("blake3_avx2.c")); - if is_windows_msvc() { - avx2_build.flag("/arch:AVX2"); - } else { - avx2_build.flag("-mavx2"); - } - avx2_build.compile("blake3_avx2"); - - let mut avx512_build = new_build(); - avx512_build.file(c_dir_path("blake3_avx512.c")); - if is_windows_msvc() { - // Note that a lot of versions of MSVC don't support /arch:AVX512, - // and they'll discard it with a warning, hopefully leading to a - // build error. 
- avx512_build.flag("/arch:AVX512"); - } else { - avx512_build.flag("-mavx512f"); - avx512_build.flag("-mavx512vl"); - } - avx512_build.compile("blake3_avx512"); - } - - // We only build NEON code here if 1) it's requested and 2) the root crate - // is not already building it. The only time this will really happen is if - // you build this crate by hand with the "neon" feature for some reason. - if defined("CARGO_FEATURE_NEON") { - let mut neon_build = new_build(); - neon_build.file(c_dir_path("blake3_neon.c")); - // ARMv7 platforms that support NEON generally need the following - // flags. AArch64 supports NEON by default and does not support -mpfu. - if is_armv7() { - neon_build.flag("-mfpu=neon-vfpv4"); - neon_build.flag("-mfloat-abi=hard"); - } - neon_build.compile("blake3_neon"); - } - - // The `cc` crate does not automatically emit rerun-if directives for the - // environment variables it supports, in particular for $CC. We expect to - // do a lot of benchmarking across different compilers, so we explicitly - // add the variables that we're likely to need. - println!("cargo:rerun-if-env-changed=CC"); - println!("cargo:rerun-if-env-changed=CFLAGS"); - - // Ditto for source files, though these shouldn't change as often. - for file in std::fs::read_dir("..")? { - println!( - "cargo:rerun-if-changed={}", - file?.path().to_str().expect("utf-8") - ); - } - - Ok(()) -} diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh deleted file mode 100644 index 94d50affb..000000000 --- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh +++ /dev/null @@ -1,31 +0,0 @@ -#! /usr/bin/env bash - -# This hacky script works around the fact that `cross test` does not support -# path dependencies. (It uses a docker shared folder to let the guest access -# project files, so parent directories aren't available.) 
Solve this problem by -# copying the entire project to a temp dir and rearranging paths to put "c" and -# "reference_impl" underneath "blake3_c_rust_bindings", so that everything is -# accessible. Hopefully this will just run on CI forever and no one will ever -# read this and discover my deep shame. - -set -e -u -o pipefail - -project_root="$(realpath "$(dirname "$BASH_SOURCE")/../..")" -tmpdir="$(mktemp -d)" -echo "Running cross tests in $tmpdir" -cd "$tmpdir" -git clone "$project_root" blake3 -mv blake3/c/blake3_c_rust_bindings . -mv blake3/reference_impl blake3_c_rust_bindings -mv blake3/c blake3_c_rust_bindings -cd blake3_c_rust_bindings -sed -i 's|reference_impl = { path = "../../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml - -export BLAKE3_C_DIR_OVERRIDE="./c" -cat > Cross.toml << EOF -[build.env] -passthrough = [ - "BLAKE3_C_DIR_OVERRIDE", -] -EOF -cross test "$@" diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs deleted file mode 100644 index f18fe123f..000000000 --- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs +++ /dev/null @@ -1,299 +0,0 @@ -//! These are Rust bindings for the C implementation of BLAKE3. As there is a -//! native (and faster) Rust implementation of BLAKE3 provided in this same -//! repo, these bindings are not expected to be used in production. They're -//! intended for testing and benchmarking. - -use std::ffi::{c_void, CString}; -use std::mem::MaybeUninit; - -#[cfg(test)] -mod test; - -pub const BLOCK_LEN: usize = 64; -pub const CHUNK_LEN: usize = 1024; -pub const OUT_LEN: usize = 32; - -// Feature detection functions for tests and benchmarks. Note that the C code -// does its own feature detection in blake3_dispatch.c. 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub fn sse2_detected() -> bool { - is_x86_feature_detected!("sse2") -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub fn sse41_detected() -> bool { - is_x86_feature_detected!("sse4.1") -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub fn avx2_detected() -> bool { - is_x86_feature_detected!("avx2") -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub fn avx512_detected() -> bool { - is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") -} - -#[derive(Clone)] -pub struct Hasher(ffi::blake3_hasher); - -impl Hasher { - pub fn new() -> Self { - let mut c_state = MaybeUninit::uninit(); - unsafe { - ffi::blake3_hasher_init(c_state.as_mut_ptr()); - Self(c_state.assume_init()) - } - } - - pub fn new_keyed(key: &[u8; 32]) -> Self { - let mut c_state = MaybeUninit::uninit(); - unsafe { - ffi::blake3_hasher_init_keyed(c_state.as_mut_ptr(), key.as_ptr()); - Self(c_state.assume_init()) - } - } - - pub fn new_derive_key(context: &str) -> Self { - let mut c_state = MaybeUninit::uninit(); - let context_c_string = CString::new(context).expect("valid C string, no null bytes"); - unsafe { - ffi::blake3_hasher_init_derive_key(c_state.as_mut_ptr(), context_c_string.as_ptr()); - Self(c_state.assume_init()) - } - } - - pub fn new_derive_key_raw(context: &[u8]) -> Self { - let mut c_state = MaybeUninit::uninit(); - unsafe { - ffi::blake3_hasher_init_derive_key_raw( - c_state.as_mut_ptr(), - context.as_ptr() as *const _, - context.len(), - ); - Self(c_state.assume_init()) - } - } - - pub fn update(&mut self, input: &[u8]) { - unsafe { - ffi::blake3_hasher_update(&mut self.0, input.as_ptr() as *const c_void, input.len()); - } - } - - pub fn finalize(&self, output: &mut [u8]) { - unsafe { - ffi::blake3_hasher_finalize(&self.0, output.as_mut_ptr(), output.len()); - } - } - - pub fn finalize_seek(&self, seek: u64, output: &mut [u8]) { - unsafe { - 
ffi::blake3_hasher_finalize_seek(&self.0, seek, output.as_mut_ptr(), output.len()); - } - } -} - -pub mod ffi { - #[repr(C)] - #[derive(Copy, Clone)] - pub struct blake3_chunk_state { - pub cv: [u32; 8usize], - pub chunk_counter: u64, - pub buf: [u8; 64usize], - pub buf_len: u8, - pub blocks_compressed: u8, - pub flags: u8, - } - - #[repr(C)] - #[derive(Copy, Clone)] - pub struct blake3_hasher { - pub key: [u32; 8usize], - pub chunk: blake3_chunk_state, - pub cv_stack_len: u8, - pub cv_stack: [u8; 1728usize], - } - - extern "C" { - // public interface - pub fn blake3_hasher_init(self_: *mut blake3_hasher); - pub fn blake3_hasher_init_keyed(self_: *mut blake3_hasher, key: *const u8); - pub fn blake3_hasher_init_derive_key( - self_: *mut blake3_hasher, - context: *const ::std::os::raw::c_char, - ); - pub fn blake3_hasher_init_derive_key_raw( - self_: *mut blake3_hasher, - context: *const ::std::os::raw::c_void, - context_len: usize, - ); - pub fn blake3_hasher_update( - self_: *mut blake3_hasher, - input: *const ::std::os::raw::c_void, - input_len: usize, - ); - pub fn blake3_hasher_finalize(self_: *const blake3_hasher, out: *mut u8, out_len: usize); - pub fn blake3_hasher_finalize_seek( - self_: *const blake3_hasher, - seek: u64, - out: *mut u8, - out_len: usize, - ); - - // portable low-level functions - pub fn blake3_compress_in_place_portable( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_portable( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_portable( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub mod x86 { - extern "C" { - // SSE2 low level functions - pub fn blake3_compress_in_place_sse2( 
- cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_sse2( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_sse2( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - - // SSE4.1 low level functions - pub fn blake3_compress_in_place_sse41( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_sse41( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_sse41( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - - // AVX2 low level functions - pub fn blake3_hash_many_avx2( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - - // AVX-512 low level functions - pub fn blake3_compress_xof_avx512( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_compress_in_place_avx512( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_hash_many_avx512( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } - } - - #[cfg(feature = "neon")] - pub mod neon { - extern "C" { - // NEON low level functions - pub fn blake3_hash_many_neon( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: 
*const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } - } -} diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs deleted file mode 100644 index b989ae9c4..000000000 --- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs +++ /dev/null @@ -1,511 +0,0 @@ -// Most of this code is duplicated from the root `blake3` crate. Perhaps we -// could share more of it in the future. - -use crate::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; -use arrayref::{array_mut_ref, array_ref}; -use arrayvec::ArrayVec; -use core::usize; -use rand::prelude::*; - -const CHUNK_START: u8 = 1 << 0; -const CHUNK_END: u8 = 1 << 1; -const PARENT: u8 = 1 << 2; -const ROOT: u8 = 1 << 3; -const KEYED_HASH: u8 = 1 << 4; -// const DERIVE_KEY_CONTEXT: u8 = 1 << 5; -// const DERIVE_KEY_MATERIAL: u8 = 1 << 6; - -// Interesting input lengths to run tests on. -pub const TEST_CASES: &[usize] = &[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - BLOCK_LEN - 1, - BLOCK_LEN, - BLOCK_LEN + 1, - 2 * BLOCK_LEN - 1, - 2 * BLOCK_LEN, - 2 * BLOCK_LEN + 1, - CHUNK_LEN - 1, - CHUNK_LEN, - CHUNK_LEN + 1, - 2 * CHUNK_LEN, - 2 * CHUNK_LEN + 1, - 3 * CHUNK_LEN, - 3 * CHUNK_LEN + 1, - 4 * CHUNK_LEN, - 4 * CHUNK_LEN + 1, - 5 * CHUNK_LEN, - 5 * CHUNK_LEN + 1, - 6 * CHUNK_LEN, - 6 * CHUNK_LEN + 1, - 7 * CHUNK_LEN, - 7 * CHUNK_LEN + 1, - 8 * CHUNK_LEN, - 8 * CHUNK_LEN + 1, - 16 * CHUNK_LEN, // AVX512's bandwidth - 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 - 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks -]; - -pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; - -// There's a test to make sure these two are equal below. -pub const TEST_KEY: [u8; 32] = *b"whats the Elvish word for friend"; -pub const TEST_KEY_WORDS: [u32; 8] = [ - 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, -]; - -// Paint the input with a repeating byte pattern. 
We use a cycle length of 251, -// because that's the largets prime number less than 256. This makes it -// unlikely to swapping any two adjacent input blocks or chunks will give the -// same answer. -fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -#[inline(always)] -fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { - let mut out = [0; 32]; - *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); - *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); - *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); - *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); - *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); - *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); - *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); - *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); - out -} - -type CompressInPlaceFn = - unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); - -type CompressXofFn = unsafe extern "C" fn( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, -); - -// A shared helper function for platform-specific tests. -pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { - let initial_state = TEST_KEY_WORDS; - let block_len: u8 = 61; - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len as usize]); - // Use a counter with set bits in both 32-bit words. 
- let counter = (5u64 << 32) + 6; - let flags = CHUNK_END | ROOT | KEYED_HASH; - - let mut portable_out = [0; 64]; - unsafe { - crate::ffi::blake3_compress_xof_portable( - initial_state.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - portable_out.as_mut_ptr(), - ); - } - - let mut test_state = initial_state; - unsafe { - compress_in_place_fn( - test_state.as_mut_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - ) - }; - let test_state_bytes = le_bytes_from_words_32(&test_state); - let mut test_xof = [0; 64]; - unsafe { - compress_xof_fn( - initial_state.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - test_xof.as_mut_ptr(), - ) - }; - - assert_eq!(&portable_out[..32], &test_state_bytes[..]); - assert_eq!(&portable_out[..], &test_xof[..]); -} - -// Testing the portable implementation against itself is circular, but why not. -#[test] -fn test_compress_portable() { - test_compress_fn( - crate::ffi::blake3_compress_in_place_portable, - crate::ffi::blake3_compress_xof_portable, - ); -} - -#[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn test_compress_sse2() { - if !crate::sse2_detected() { - return; - } - test_compress_fn( - crate::ffi::x86::blake3_compress_in_place_sse2, - crate::ffi::x86::blake3_compress_xof_sse2, - ); -} - -#[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn test_compress_sse41() { - if !crate::sse41_detected() { - return; - } - test_compress_fn( - crate::ffi::x86::blake3_compress_in_place_sse41, - crate::ffi::x86::blake3_compress_xof_sse41, - ); -} - -#[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn test_compress_avx512() { - if !crate::avx512_detected() { - return; - } - test_compress_fn( - crate::ffi::x86::blake3_compress_in_place_avx512, - crate::ffi::x86::blake3_compress_xof_avx512, - ); -} - -type HashManyFn = unsafe extern "C" fn( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - 
increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, -); - -// A shared helper function for platform-specific tests. -pub fn test_hash_many_fn(hash_many_fn: HashManyFn) { - // 31 (16 + 8 + 4 + 2 + 1) inputs - const NUM_INPUTS: usize = 31; - let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; - crate::test::paint_test_input(&mut input_buf); - // A counter just prior to u32::MAX. - let counter = (1u64 << 32) - 1; - - // First hash chunks. - let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new(); - for i in 0..NUM_INPUTS { - chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); - } - let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - crate::ffi::blake3_hash_many_portable( - chunks.as_ptr() as _, - chunks.len(), - CHUNK_LEN / BLOCK_LEN, - TEST_KEY_WORDS.as_ptr(), - counter, - true, - KEYED_HASH, - CHUNK_START, - CHUNK_END, - portable_chunks_out.as_mut_ptr(), - ); - } - - let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_fn( - chunks.as_ptr() as _, - chunks.len(), - CHUNK_LEN / BLOCK_LEN, - TEST_KEY_WORDS.as_ptr(), - counter, - true, - KEYED_HASH, - CHUNK_START, - CHUNK_END, - test_chunks_out.as_mut_ptr(), - ); - } - for n in 0..NUM_INPUTS { - dbg!(n); - assert_eq!( - &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], - &test_chunks_out[n * OUT_LEN..][..OUT_LEN] - ); - } - - // Then hash parents. 
- let mut parents = ArrayVec::<[&[u8; 2 * OUT_LEN]; NUM_INPUTS]>::new(); - for i in 0..NUM_INPUTS { - parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); - } - let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - crate::ffi::blake3_hash_many_portable( - parents.as_ptr() as _, - parents.len(), - 1, - TEST_KEY_WORDS.as_ptr(), - counter, - false, - KEYED_HASH | PARENT, - 0, - 0, - portable_parents_out.as_mut_ptr(), - ); - } - - let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_fn( - parents.as_ptr() as _, - parents.len(), - 1, - TEST_KEY_WORDS.as_ptr(), - counter, - false, - KEYED_HASH | PARENT, - 0, - 0, - test_parents_out.as_mut_ptr(), - ); - } - for n in 0..NUM_INPUTS { - dbg!(n); - assert_eq!( - &portable_parents_out[n * OUT_LEN..][..OUT_LEN], - &test_parents_out[n * OUT_LEN..][..OUT_LEN] - ); - } -} - -// Testing the portable implementation against itself is circular, but why not. -#[test] -fn test_hash_many_portable() { - test_hash_many_fn(crate::ffi::blake3_hash_many_portable); -} - -#[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn test_hash_many_sse2() { - if !crate::sse2_detected() { - return; - } - test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse2); -} - -#[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn test_hash_many_sse41() { - if !crate::sse41_detected() { - return; - } - test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse41); -} - -#[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn test_hash_many_avx2() { - if !crate::avx2_detected() { - return; - } - test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx2); -} - -#[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn test_hash_many_avx512() { - if !crate::avx512_detected() { - return; - } - test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx512); -} - -#[test] -#[cfg(feature = "neon")] -fn test_hash_many_neon() { - 
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon); -} - -#[test] -fn test_compare_reference_impl() { - const OUT: usize = 303; // more than 64, not a multiple of 4 - let mut input_buf = [0; TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - for &case in TEST_CASES { - let input = &input_buf[..case]; - dbg!(case); - - // regular - { - let mut reference_hasher = reference_impl::Hasher::new(); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - let mut test_hasher = crate::Hasher::new(); - test_hasher.update(input); - let mut test_out = [0; OUT]; - test_hasher.finalize(&mut test_out); - assert_eq!(test_out[..], expected_out[..]); - } - - // keyed - { - let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); - test_hasher.update(input); - let mut test_out = [0; OUT]; - test_hasher.finalize(&mut test_out); - assert_eq!(test_out[..], expected_out[..]); - } - - // derive_key - { - let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; - let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // the regular C string API - let mut test_hasher = crate::Hasher::new_derive_key(context); - test_hasher.update(input); - let mut test_out = [0; OUT]; - test_hasher.finalize(&mut test_out); - assert_eq!(test_out[..], expected_out[..]); - - // the raw bytes API - let mut test_hasher_raw = crate::Hasher::new_derive_key_raw(context.as_bytes()); - test_hasher_raw.update(input); - let mut test_out_raw = [0; OUT]; - test_hasher_raw.finalize(&mut test_out_raw); - assert_eq!(test_out_raw[..], expected_out[..]); - } - } -} - -fn reference_hash(input: 
&[u8]) -> [u8; OUT_LEN] { - let mut hasher = reference_impl::Hasher::new(); - hasher.update(input); - let mut bytes = [0; OUT_LEN]; - hasher.finalize(&mut bytes); - bytes.into() -} - -#[test] -fn test_compare_update_multiple() { - // Don't use all the long test cases here, since that's unnecessarily slow - // in debug mode. - let mut short_test_cases = TEST_CASES; - while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { - short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; - } - assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); - - let mut input_buf = [0; 2 * TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - - for &first_update in short_test_cases { - dbg!(first_update); - let first_input = &input_buf[..first_update]; - let mut test_hasher = crate::Hasher::new(); - test_hasher.update(first_input); - - for &second_update in short_test_cases { - dbg!(second_update); - let second_input = &input_buf[first_update..][..second_update]; - let total_input = &input_buf[..first_update + second_update]; - - // Clone the hasher with first_update bytes already written, so - // that the next iteration can reuse it. - let mut test_hasher = test_hasher.clone(); - test_hasher.update(second_input); - let mut test_out = [0; OUT_LEN]; - test_hasher.finalize(&mut test_out); - - let expected = reference_hash(total_input); - assert_eq!(expected, test_out); - } - } -} - -#[test] -fn test_fuzz_hasher() { - const INPUT_MAX: usize = 4 * CHUNK_LEN; - let mut input_buf = [0; 3 * INPUT_MAX]; - paint_test_input(&mut input_buf); - - // Don't do too many iterations in debug mode, to keep the tests under a - // second or so. CI should run tests in release mode also. Provide an - // environment variable for specifying a larger number of fuzz iterations. - let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; - - // Use a fixed RNG seed for reproducibility. 
- let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); - for _num_test in 0..num_tests { - dbg!(_num_test); - let mut hasher = crate::Hasher::new(); - let mut total_input = 0; - // For each test, write 3 inputs of random length. - for _ in 0..3 { - let input_len = rng.gen_range(0, INPUT_MAX + 1); - dbg!(input_len); - let input = &input_buf[total_input..][..input_len]; - hasher.update(input); - total_input += input_len; - } - let expected = reference_hash(&input_buf[..total_input]); - let mut test_out = [0; 32]; - hasher.finalize(&mut test_out); - assert_eq!(expected, test_out); - } -} - -#[test] -fn test_finalize_seek() { - let mut expected = [0; 1000]; - { - let mut reference_hasher = reference_impl::Hasher::new(); - reference_hasher.update(b"foobarbaz"); - reference_hasher.finalize(&mut expected); - } - - let mut test_hasher = crate::Hasher::new(); - test_hasher.update(b"foobarbaz"); - - let mut out = [0; 103]; - for &seek in &[0, 1, 7, 59, 63, 64, 65, 501, expected.len() - out.len()] { - dbg!(seek); - test_hasher.finalize_seek(seek as u64, &mut out); - assert_eq!(&expected[seek..][..out.len()], &out[..]); - } -} diff --git a/thirdparty/BLAKE3/c/blake3_dispatch.c b/thirdparty/BLAKE3/c/blake3_dispatch.c deleted file mode 100644 index 6518478e5..000000000 --- a/thirdparty/BLAKE3/c/blake3_dispatch.c +++ /dev/null @@ -1,276 +0,0 @@ -#include <stdbool.h> -#include <stddef.h> -#include <stdint.h> - -#include "blake3_impl.h" - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include <intrin.h> -#elif defined(__GNUC__) -#include <immintrin.h> -#else -#error "Unimplemented!" 
-#endif -#endif - -#define MAYBE_UNUSED(x) (void)((x)) - -#if defined(IS_X86) -static uint64_t xgetbv() { -#if defined(_MSC_VER) - return _xgetbv(0); -#else - uint32_t eax = 0, edx = 0; - __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); - return ((uint64_t)edx << 32) | eax; -#endif -} - -static void cpuid(uint32_t out[4], uint32_t id) { -#if defined(_MSC_VER) - __cpuid((int *)out, id); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#endif -} - -static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { -#if defined(_MSC_VER) - __cpuidex((int *)out, id, sid); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#endif -} - -#endif - -enum cpu_feature { - SSE2 = 1 << 0, - SSSE3 = 1 << 1, - SSE41 = 1 << 2, - AVX = 1 << 3, - AVX2 = 1 << 4, - AVX512F = 1 << 5, - AVX512VL = 1 << 6, - /* ... 
*/ - UNDEFINED = 1 << 30 -}; - -#if !defined(BLAKE3_TESTING) -static /* Allow the variable to be controlled manually for testing */ -#endif - enum cpu_feature g_cpu_features = UNDEFINED; - -#if !defined(BLAKE3_TESTING) -static -#endif - enum cpu_feature - get_cpu_features() { - - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; - } else { -#if defined(IS_X86) - uint32_t regs[4] = {0}; - uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; - (void)edx; - enum cpu_feature features = 0; - cpuid(regs, 0); - const int max_id = *eax; - cpuid(regs, 1); -#if defined(__amd64__) || defined(_M_X64) - features |= SSE2; -#else - if (*edx & (1UL << 26)) - features |= SSE2; -#endif - if (*ecx & (1UL << 0)) - features |= SSSE3; - if (*ecx & (1UL << 19)) - features |= SSE41; - - if (*ecx & (1UL << 27)) { // OSXSAVE - const uint64_t mask = xgetbv(); - if ((mask & 6) == 6) { // SSE and AVX states - if (*ecx & (1UL << 28)) - features |= AVX; - if (max_id >= 7) { - cpuidex(regs, 7, 0); - if (*ebx & (1UL << 5)) - features |= AVX2; - if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm - if (*ebx & (1UL << 31)) - features |= AVX512VL; - if (*ebx & (1UL << 16)) - features |= AVX512F; - } - } - } - } - g_cpu_features = features; - return features; -#else - /* How to detect NEON? 
*/ - return 0; -#endif - } -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); - MAYBE_UNUSED(features); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE2) - if (features & SSE2) { - blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); - return; - } -#endif -#endif - blake3_compress_in_place_portable(cv, block, block_len, counter, flags); -} - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); - MAYBE_UNUSED(features); -#if !defined(BLAKE3_NO_AVX512) - if (features & AVX512VL) { - blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE2) - if (features & SSE2) { - blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); - return; - } -#endif -#endif - blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); -} - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); - MAYBE_UNUSED(features); -#if !defined(BLAKE3_NO_AVX512) - 
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#if !defined(BLAKE3_NO_SSE2) - if (features & SSE2) { - blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); - return; - } -#endif -#endif - -#if defined(BLAKE3_USE_NEON) - blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - return; -#endif - - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); -} - -// The dynamically detected SIMD degree of the current platform. 
-size_t blake3_simd_degree(void) { -#if defined(IS_X86) - const enum cpu_feature features = get_cpu_features(); - MAYBE_UNUSED(features); -#if !defined(BLAKE3_NO_AVX512) - if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { - return 16; - } -#endif -#if !defined(BLAKE3_NO_AVX2) - if (features & AVX2) { - return 8; - } -#endif -#if !defined(BLAKE3_NO_SSE41) - if (features & SSE41) { - return 4; - } -#endif -#if !defined(BLAKE3_NO_SSE2) - if (features & SSE2) { - return 4; - } -#endif -#endif -#if defined(BLAKE3_USE_NEON) - return 4; -#endif - return 1; -} diff --git a/thirdparty/BLAKE3/c/blake3_impl.h b/thirdparty/BLAKE3/c/blake3_impl.h deleted file mode 100644 index 86ab6aa25..000000000 --- a/thirdparty/BLAKE3/c/blake3_impl.h +++ /dev/null @@ -1,269 +0,0 @@ -#ifndef BLAKE3_IMPL_H -#define BLAKE3_IMPL_H - -#include <assert.h> -#include <stdbool.h> -#include <stddef.h> -#include <stdint.h> -#include <string.h> - -#include "blake3.h" - -// internal flags -enum blake3_flags { - CHUNK_START = 1 << 0, - CHUNK_END = 1 << 1, - PARENT = 1 << 2, - ROOT = 1 << 3, - KEYED_HASH = 1 << 4, - DERIVE_KEY_CONTEXT = 1 << 5, - DERIVE_KEY_MATERIAL = 1 << 6, -}; - -// This C implementation tries to support recent versions of GCC, Clang, and -// MSVC. -#if defined(_MSC_VER) -#define INLINE static __forceinline -#else -#define INLINE static inline __attribute__((always_inline)) -#endif - -#if defined(__x86_64__) || defined(_M_X64) -#define IS_X86 -#define IS_X86_64 -#endif - -#if defined(__i386__) || defined(_M_IX86) -#define IS_X86 -#define IS_X86_32 -#endif - -#if defined(IS_X86) -#if defined(_MSC_VER) -#include <intrin.h> -#endif -#include <immintrin.h> -#endif - -#if defined(IS_X86) -#define MAX_SIMD_DEGREE 16 -#elif defined(BLAKE3_USE_NEON) -#define MAX_SIMD_DEGREE 4 -#else -#define MAX_SIMD_DEGREE 1 -#endif - -// There are some places where we want a static size that's equal to the -// MAX_SIMD_DEGREE, but also at least 2. 
-#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) - -static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, - 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, - 0x1F83D9ABUL, 0x5BE0CD19UL}; - -static const uint8_t MSG_SCHEDULE[7][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, - {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, - {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, - {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, - {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, - {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, -}; - -/* Find index of the highest set bit */ -/* x is assumed to be nonzero. */ -static unsigned int highest_one(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return 63 ^ __builtin_clzll(x); -#elif defined(_MSC_VER) && defined(IS_X86_64) - unsigned long index; - _BitScanReverse64(&index, x); - return index; -#elif defined(_MSC_VER) && defined(IS_X86_32) - if(x >> 32) { - unsigned long index; - _BitScanReverse(&index, x >> 32); - return 32 + index; - } else { - unsigned long index; - _BitScanReverse(&index, x); - return index; - } -#else - unsigned int c = 0; - if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } - if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } - if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } - if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } - if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } - if(x & 0x0000000000000002ULL) { c += 1; } - return c; -#endif -} - -// Count the number of 1 bits. -INLINE unsigned int popcnt(uint64_t x) { -#if defined(__GNUC__) || defined(__clang__) - return __builtin_popcountll(x); -#else - unsigned int count = 0; - while (x != 0) { - count += 1; - x &= x - 1; - } - return count; -#endif -} - -// Largest power of two less than or equal to x. As a special case, returns 1 -// when x is 0. 
-INLINE uint64_t round_down_to_power_of_2(uint64_t x) { - return 1ULL << highest_one(x | 1); -} - -INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } - -INLINE uint32_t counter_high(uint64_t counter) { - return (uint32_t)(counter >> 32); -} - -INLINE uint32_t load32(const void *src) { - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); -} - -INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], - uint32_t key_words[8]) { - key_words[0] = load32(&key[0 * 4]); - key_words[1] = load32(&key[1 * 4]); - key_words[2] = load32(&key[2 * 4]); - key_words[3] = load32(&key[3 * 4]); - key_words[4] = load32(&key[4 * 4]); - key_words[5] = load32(&key[5 * 4]); - key_words[6] = load32(&key[6 * 4]); - key_words[7] = load32(&key[7 * 4]); -} - -INLINE void store32(void *dst, uint32_t w) { - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -} - -INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { - store32(&bytes_out[0 * 4], cv_words[0]); - store32(&bytes_out[1 * 4], cv_words[1]); - store32(&bytes_out[2 * 4], cv_words[2]); - store32(&bytes_out[3 * 4], cv_words[3]); - store32(&bytes_out[4 * 4], cv_words[4]); - store32(&bytes_out[5 * 4], cv_words[5]); - store32(&bytes_out[6 * 4], cv_words[6]); - store32(&bytes_out[7 * 4], cv_words[7]); -} - -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]); - -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, 
uint8_t flags_end, uint8_t *out); - -size_t blake3_simd_degree(void); - - -// Declarations for implementation-specific functions. -void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); - -#if defined(IS_X86) -#if !defined(BLAKE3_NO_SSE2) -void blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_SSE41) -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX2) -void blake3_hash_many_avx2(const uint8_t *const *inputs, 
size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#if !defined(BLAKE3_NO_AVX512) -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif -#endif - -#if defined(BLAKE3_USE_NEON) -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif - - -#endif /* BLAKE3_IMPL_H */ diff --git a/thirdparty/BLAKE3/c/blake3_neon.c b/thirdparty/BLAKE3/c/blake3_neon.c deleted file mode 100644 index 46691f526..000000000 --- a/thirdparty/BLAKE3/c/blake3_neon.c +++ /dev/null @@ -1,346 +0,0 @@ -#include "blake3_impl.h" - -#include <arm_neon.h> - -// TODO: This is probably incorrect for big-endian ARM. How should that work? -INLINE uint32x4_t loadu_128(const uint8_t src[16]) { - // vld1q_u32 has alignment requirements. Don't use it. - uint32x4_t x; - memcpy(&x, src, 16); - return x; -} - -INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { - // vst1q_u32 has alignment requirements. Don't use it. 
- memcpy(dest, &src, 16); -} - -INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { - return vaddq_u32(a, b); -} - -INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { - return veorq_u32(a, b); -} - -INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } - -INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - uint32_t array[4] = {a, b, c, d}; - return vld1q_u32(array); -} - -INLINE uint32x4_t rot16_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); -} - -INLINE uint32x4_t rot12_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); -} - -INLINE uint32x4_t rot8_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); -} - -INLINE uint32x4_t rot7_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); -} - -// TODO: compress_neon - -// TODO: hash2_neon - -/* - * ---------------------------------------------------------------------------- - * hash4_neon - * ---------------------------------------------------------------------------- - */ - -INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[15] = rot16_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = 
xor_128(v[7], v[11]); - v[4] = rot12_128(v[4]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = add_128(v[0], v[4]); - v[1] = add_128(v[1], v[5]); - v[2] = add_128(v[2], v[6]); - v[3] = add_128(v[3], v[7]); - v[12] = xor_128(v[12], v[0]); - v[13] = xor_128(v[13], v[1]); - v[14] = xor_128(v[14], v[2]); - v[15] = xor_128(v[15], v[3]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[15] = rot8_128(v[15]); - v[8] = add_128(v[8], v[12]); - v[9] = add_128(v[9], v[13]); - v[10] = add_128(v[10], v[14]); - v[11] = add_128(v[11], v[15]); - v[4] = xor_128(v[4], v[8]); - v[5] = xor_128(v[5], v[9]); - v[6] = xor_128(v[6], v[10]); - v[7] = xor_128(v[7], v[11]); - v[4] = rot7_128(v[4]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - - v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot16_128(v[15]); - v[12] = rot16_128(v[12]); - v[13] = rot16_128(v[13]); - v[14] = rot16_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot12_128(v[5]); - v[6] = rot12_128(v[6]); - v[7] = rot12_128(v[7]); - v[4] = rot12_128(v[4]); - v[0] = 
add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = add_128(v[0], v[5]); - v[1] = add_128(v[1], v[6]); - v[2] = add_128(v[2], v[7]); - v[3] = add_128(v[3], v[4]); - v[15] = xor_128(v[15], v[0]); - v[12] = xor_128(v[12], v[1]); - v[13] = xor_128(v[13], v[2]); - v[14] = xor_128(v[14], v[3]); - v[15] = rot8_128(v[15]); - v[12] = rot8_128(v[12]); - v[13] = rot8_128(v[13]); - v[14] = rot8_128(v[14]); - v[10] = add_128(v[10], v[15]); - v[11] = add_128(v[11], v[12]); - v[8] = add_128(v[8], v[13]); - v[9] = add_128(v[9], v[14]); - v[5] = xor_128(v[5], v[10]); - v[6] = xor_128(v[6], v[11]); - v[7] = xor_128(v[7], v[8]); - v[4] = xor_128(v[4], v[9]); - v[5] = rot7_128(v[5]); - v[6] = rot7_128(v[6]); - v[7] = rot7_128(v[7]); - v[4] = rot7_128(v[4]); -} - -INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { - // Individually transpose the four 2x2 sub-matrices in each corner. - uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); - uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); - - // Swap the top-right and bottom-left 2x2s (which just got transposed). 
- vecs[0] = - vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); - vecs[1] = - vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); - vecs[2] = - vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); - vecs[3] = - vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); -} - -INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, - size_t block_offset, uint32x4_t out[16]) { - out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); - out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); - out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); - out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); - out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); - out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); - out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); - out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); - out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); - out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); - out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]); - out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); - out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); - out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); - out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); - out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); - transpose_vecs_128(&out[0]); - transpose_vecs_128(&out[4]); - transpose_vecs_128(&out[8]); - transpose_vecs_128(&out[12]); -} - -INLINE void load_counters4(uint64_t counter, bool increment_counter, - uint32x4_t *out_low, uint32x4_t *out_high) { - uint64_t mask = (increment_counter ? 
~0 : 0); - *out_low = set4( - counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); - *out_high = set4( - counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); -} - -void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32x4_t h_vecs[8] = { - set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), - set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), - }; - uint32x4_t counter_low_vec, counter_high_vec; - load_counters4(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); - uint32x4_t block_flags_vec = set1_128(block_flags); - uint32x4_t msg_vecs[16]; - transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - uint32x4_t v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn4(v, msg_vecs, 0); - round_fn4(v, msg_vecs, 1); - round_fn4(v, msg_vecs, 2); - round_fn4(v, msg_vecs, 3); - round_fn4(v, msg_vecs, 4); - round_fn4(v, msg_vecs, 5); - round_fn4(v, msg_vecs, 6); - h_vecs[0] = xor_128(v[0], v[8]); - h_vecs[1] = xor_128(v[1], v[9]); - h_vecs[2] = xor_128(v[2], v[10]); - h_vecs[3] = xor_128(v[3], v[11]); - h_vecs[4] = xor_128(v[4], v[12]); - h_vecs[5] = xor_128(v[5], v[13]); - h_vecs[6] = xor_128(v[6], v[14]); - h_vecs[7] = xor_128(v[7], v[15]); - - 
block_flags = flags; - } - - transpose_vecs_128(&h_vecs[0]); - transpose_vecs_128(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); - storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); - storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); - storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); - storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); - storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); - storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); - storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); -} - -/* - * ---------------------------------------------------------------------------- - * hash_many_neon - * ---------------------------------------------------------------------------- - */ - -void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -INLINE void hash_one_neon(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, uint8_t flags_end, - uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - // TODO: Implement compress_neon. However note that according to - // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, - // compress_neon might not be any faster than compress_portable. 
- blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= 4) { - blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += 4; - } - inputs += 4; - num_inputs -= 4; - out = &out[4 * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/thirdparty/BLAKE3/c/blake3_portable.c b/thirdparty/BLAKE3/c/blake3_portable.c deleted file mode 100644 index 062dd1b47..000000000 --- a/thirdparty/BLAKE3/c/blake3_portable.c +++ /dev/null @@ -1,160 +0,0 @@ -#include "blake3_impl.h" -#include <string.h> - -INLINE uint32_t rotr32(uint32_t w, uint32_t c) { - return (w >> c) | (w << (32 - c)); -} - -INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, - uint32_t x, uint32_t y) { - state[a] = state[a] + state[b] + x; - state[d] = rotr32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = rotr32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 7); -} - -INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { - // Select the message schedule based on the round. - const uint8_t *schedule = MSG_SCHEDULE[round]; - - // Mix the columns. 
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - state[0] = cv[0]; - state[1] = cv[1]; - state[2] = cv[2]; - state[3] = cv[3]; - state[4] = cv[4]; - state[5] = cv[5]; - state[6] = cv[6]; - state[7] = cv[7]; - state[8] = IV[0]; - state[9] = IV[1]; - state[10] = IV[2]; - state[11] = IV[3]; - state[12] = counter_low(counter); - state[13] = counter_high(counter); - state[14] = (uint32_t)block_len; - state[15] = (uint32_t)flags; - - round_fn(state, &block_words[0], 0); - round_fn(state, &block_words[0], 1); - round_fn(state, &block_words[0], 2); - round_fn(state, &block_words[0], 3); - round_fn(state, &block_words[0], 4); - round_fn(state, 
&block_words[0], 5); - round_fn(state, &block_words[0], 6); -} - -void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); -} - -INLINE void hash_one_portable(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, - 
block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - store_cv_words(out, cv); -} - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs > 0) { - hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/thirdparty/BLAKE3/c/blake3_sse2.c b/thirdparty/BLAKE3/c/blake3_sse2.c deleted file mode 100644 index 159296688..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse2.c +++ /dev/null @@ -1,565 +0,0 @@ -#include "blake3_impl.h" - -#include <immintrin.h> - -#define DEGREE 4 - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE void storeu(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. 
-INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16(__m128i x) { - return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); -} - -INLINE __m128i rot12(__m128i x) { - return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); -} - -INLINE __m128i rot8(__m128i x) { - return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); -} - -INLINE __m128i rot7(__m128i x) { - return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); -} - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot16(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot12(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot8(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot7(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) { - const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - __m128i mask = _mm_set1_epi16(imm8); - mask = _mm_and_si128(mask, bits); - mask = _mm_cmpeq_epi16(mask, bits); - return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu((uint8_t *)&cv[0]); - rows[1] = loadu((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 
0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void 
blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -void blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), &out[0]); - storeu(xorv(rows[1], rows[3]), &out[16]); - storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); - storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); -} - -INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] 
= addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); 
- v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m128i vecs[DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = 
loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[4]); - transpose_vecs(&out[8]); - transpose_vecs(&out[12]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); - const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); - const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); - __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), - _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, 
counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&h_vecs[0]); - transpose_vecs(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs >= DEGREE) { - 
blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S deleted file mode 100644 index d144046ab..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S +++ /dev/null @@ -1,2291 +0,0 @@ -#if defined(__ELF__) && defined(__linux__) -.section .note.GNU-stack,"",%progbits -#endif - -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - -.intel_syntax noprefix -.global blake3_hash_many_sse2 -.global _blake3_hash_many_sse2 -.global blake3_compress_in_place_sse2 -.global _blake3_compress_in_place_sse2 -.global blake3_compress_xof_sse2 -.global _blake3_compress_xof_sse2 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_sse2: -blake3_hash_many_sse2: - _CET_ENDBR - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 360 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 0x00 - movdqa xmmword ptr [rsp+0x130], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0+rip] - pand xmm0, xmmword ptr [ADD1+rip] - movdqa xmmword ptr [rsp+0x150], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 0x00 - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+0x110], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 0x00 - psubd xmm2, 
xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - mov rbx, qword ptr [rbp+0x50] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr [rbp+0x48] - cmp rsi, 4 - jc 3f -2: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 0x00 - pshufd xmm1, xmm3, 0x55 - pshufd xmm2, xmm3, 0xAA - pshufd xmm3, xmm3, 0xFF - movdqu xmm7, xmmword ptr [rcx+0x10] - pshufd xmm4, xmm7, 0x00 - pshufd xmm5, xmm7, 0x55 - pshufd xmm6, xmm7, 0xAA - pshufd xmm7, xmm7, 0xFF - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -9: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-0x40] - movdqu xmm9, xmmword ptr [r9+rdx-0x40] - movdqu xmm10, xmmword ptr [r10+rdx-0x40] - movdqu xmm11, xmmword ptr [r11+rdx-0x40] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+0x10], xmm9 - movdqa xmmword ptr [rsp+0x20], xmm12 - movdqa xmmword ptr [rsp+0x30], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x30] - movdqu xmm9, xmmword ptr [r9+rdx-0x30] - movdqu xmm10, xmmword ptr [r10+rdx-0x30] - movdqu xmm11, xmmword ptr [r11+rdx-0x30] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x40], xmm8 - movdqa xmmword ptr [rsp+0x50], xmm9 - movdqa xmmword ptr [rsp+0x60], xmm12 - movdqa xmmword ptr [rsp+0x70], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x20] - movdqu 
xmm9, xmmword ptr [r9+rdx-0x20] - movdqu xmm10, xmmword ptr [r10+rdx-0x20] - movdqu xmm11, xmmword ptr [r11+rdx-0x20] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x80], xmm8 - movdqa xmmword ptr [rsp+0x90], xmm9 - movdqa xmmword ptr [rsp+0xA0], xmm12 - movdqa xmmword ptr [rsp+0xB0], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x10] - movdqu xmm9, xmmword ptr [r9+rdx-0x10] - movdqu xmm10, xmmword ptr [r10+rdx-0x10] - movdqu xmm11, xmmword ptr [r11+rdx-0x10] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0xC0], xmm8 - movdqa xmmword ptr [rsp+0xD0], xmm9 - movdqa xmmword ptr [rsp+0xE0], xmm12 - movdqa xmmword ptr [rsp+0xF0], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] - movdqa xmm12, xmmword ptr [rsp+0x110] - movdqa xmm13, xmmword ptr [rsp+0x120] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - movd xmm15, eax - pshufd xmm15, xmm15, 0x00 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - 
pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x80] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor 
xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, 
xmmword ptr [rsp+0x70] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - 
movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 
25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - 
pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xB0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x50] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - 
pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - 
psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, 
xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xC0] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp+0xB0] - 
paddd xmm2, xmmword ptr [rsp+0xA0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por 
xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - 
pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, 
xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0x60] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw 
xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xF0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 
- pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne 9b - movdqa xmm9, xmm0 - punpckldq 
xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+0x20], xmm1 - movdqu xmmword ptr [rbx+0x40], xmm9 - movdqu xmmword ptr [rbx+0x60], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+0x10], xmm4 - movdqu xmmword ptr [rbx+0x30], xmm5 - movdqu xmmword ptr [rbx+0x50], xmm9 - movdqu xmmword ptr [rbx+0x70], xmm7 - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+0x150] - movdqa xmmword ptr [rsp+0x110], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+0x120] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+0x120], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc 2b - test rsi, rsi - jnz 3f -4: - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - test esi, 0x2 - je 3f - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+0x110] - movd xmm14, dword ptr [rsp+0x120] - punpckldq xmm13, xmm14 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr [rsp+0x114] - movd xmm13, dword ptr [rsp+0x124] - punpckldq xmm14, xmm13 - movaps xmmword ptr [rsp+0x10], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps 
xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 0x93 - movups xmm12, xmmword ptr [r9+rdx-0x40] - movups xmm13, xmmword ptr [r9+rdx-0x30] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-0x20] - movups xmm15, xmmword ptr [r9+rdx-0x10] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 0x93 - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 0x93 - shl rax, 0x20 - or rax, 0x40 - movd xmm3, rax - movdqa xmmword ptr [rsp+0x20], xmm3 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+0x10] - punpcklqdq xmm3, xmmword ptr [rsp+0x20] - punpcklqdq xmm11, xmmword ptr [rsp+0x20] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+0x20], xmm4 - movaps xmmword ptr [rsp+0x30], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - pshuflw xmm11, xmm11, 0xB1 - pshufhw xmm11, xmm11, 0xB1 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+0x40], xmm5 - movaps xmmword ptr [rsp+0x50], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movdqa xmm13, xmm3 - psrld xmm3, 8 - pslld xmm13, 24 - pxor xmm3, xmm13 - movdqa xmm13, xmm11 - psrld xmm11, 8 - pslld xmm13, 24 - pxor xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa 
xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x93 - pshufd xmm8, xmm8, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x39 - pshufd xmm10, xmm10, 0x39 - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - pshuflw xmm11, xmm11, 0xB1 - pshufhw xmm11, xmm11, 0xB1 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movdqa xmm13, xmm3 - psrld xmm3, 8 - pslld xmm13, 24 - pxor xmm3, xmm13 - movdqa xmm13, xmm11 - psrld xmm11, 8 - pslld xmm13, 24 - pxor xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x39 - pshufd xmm8, xmm8, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x93 - pshufd xmm10, xmm10, 0x93 - dec al - je 9f - movdqa xmm12, xmmword ptr [rsp+0x20] - movdqa xmm5, xmmword ptr [rsp+0x40] - pshufd xmm13, xmm12, 0x0F - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 0x39 - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm13, xmm12 - movdqa xmmword ptr [rsp+0x20], xmm13 - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - movdqa xmm13, xmm6 - pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm12, xmm13 - pshufd xmm12, xmm12, 0x78 - punpckhdq xmm5, xmm7 - 
punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmmword ptr [rsp+0x40], xmm12 - movdqa xmm5, xmmword ptr [rsp+0x30] - movdqa xmm13, xmmword ptr [rsp+0x50] - pshufd xmm6, xmm5, 0x0F - shufps xmm5, xmm13, 214 - pshufd xmm12, xmm5, 0x39 - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm6, xmm5 - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - movdqa xmmword ptr [rsp+0x30], xmm2 - movdqa xmm2, xmm14 - pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm5, xmm2 - movdqa xmm2, xmmword ptr [rsp+0x30] - pshufd xmm5, xmm5, 0x78 - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 0x1E - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+0x20] - movdqa xmm6, xmmword ptr [rsp+0x40] - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - movups xmmword ptr [rbx+0x20], xmm8 - movups xmmword ptr [rbx+0x30], xmm9 - mov eax, dword ptr [rsp+0x130] - neg eax - mov r10d, dword ptr [rsp+0x110+8*rax] - mov r11d, dword ptr [rsp+0x120+8*rax] - mov dword ptr [rsp+0x110], r10d - mov dword ptr [rsp+0x120], r11d - add rdi, 16 - add rbx, 64 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movd xmm13, dword ptr [rsp+0x110] - movd xmm14, dword ptr [rsp+0x120] - punpckldq xmm13, xmm14 - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - shl rax, 32 - or rax, 64 - movd xmm12, rax - movdqa xmm3, xmm13 - punpcklqdq xmm3, xmm12 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm8, xmm4 - 
shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor 
xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.p2align 6 -blake3_compress_in_place_sse2: -_blake3_compress_in_place_sse2: - _CET_ENDBR - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - shl r8, 32 - add rdx, r8 - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa 
xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rdi], xmm0 - movups xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -blake3_compress_xof_sse2: -_blake3_compress_xof_sse2: - _CET_ENDBR - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - 
pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - movdqu xmm4, xmmword ptr [rdi] - movdqu xmm5, xmmword ptr [rdi+0x10] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r9], xmm0 - movups xmmword ptr [r9+0x10], xmm1 - movups xmmword ptr [r9+0x20], xmm2 - movups xmmword ptr [r9+0x30], xmm3 - ret - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85 - .long 0x3C6EF372, 0xA54FF53A -ADD0: - .long 0, 1, 2, 3 -ADD1: - .long 4, 4, 4, 4 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 64, 64, 64, 64 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -PBLENDW_0x33_MASK: - .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 -PBLENDW_0xCC_MASK: - .long 0x00000000, 0xFFFFFFFF, 
0x00000000, 0xFFFFFFFF -PBLENDW_0x3F_MASK: - .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 -PBLENDW_0xC0_MASK: - .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S deleted file mode 100644 index 494c0c6fd..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S +++ /dev/null @@ -1,2332 +0,0 @@ -.intel_syntax noprefix -.global blake3_hash_many_sse2 -.global _blake3_hash_many_sse2 -.global blake3_compress_in_place_sse2 -.global _blake3_compress_in_place_sse2 -.global blake3_compress_xof_sse2 -.global _blake3_compress_xof_sse2 -.section .text - .p2align 6 -_blake3_hash_many_sse2: -blake3_hash_many_sse2: - push r15 - push r14 - push r13 - push r12 - push rsi - push rdi - push rbx - push rbp - mov rbp, rsp - sub rsp, 528 - and rsp, 0xFFFFFFFFFFFFFFC0 - movdqa xmmword ptr [rsp+0x170], xmm6 - movdqa xmmword ptr [rsp+0x180], xmm7 - movdqa xmmword ptr [rsp+0x190], xmm8 - movdqa xmmword ptr [rsp+0x1A0], xmm9 - movdqa xmmword ptr [rsp+0x1B0], xmm10 - movdqa xmmword ptr [rsp+0x1C0], xmm11 - movdqa xmmword ptr [rsp+0x1D0], xmm12 - movdqa xmmword ptr [rsp+0x1E0], xmm13 - movdqa xmmword ptr [rsp+0x1F0], xmm14 - movdqa xmmword ptr [rsp+0x200], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+0x68] - movzx r9, byte ptr [rbp+0x70] - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 0x00 - movdqa xmmword ptr [rsp+0x130], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0+rip] - pand xmm0, xmmword ptr [ADD1+rip] - movdqa xmmword ptr [rsp+0x150], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 0x00 - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+0x110], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 0x00 - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - mov rbx, qword ptr [rbp+0x90] - 
mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+0x78] - movzx r12d, byte ptr [rbp+0x88] - cmp rsi, 4 - jc 3f -2: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 0x00 - pshufd xmm1, xmm3, 0x55 - pshufd xmm2, xmm3, 0xAA - pshufd xmm3, xmm3, 0xFF - movdqu xmm7, xmmword ptr [rcx+0x10] - pshufd xmm4, xmm7, 0x00 - pshufd xmm5, xmm7, 0x55 - pshufd xmm6, xmm7, 0xAA - pshufd xmm7, xmm7, 0xFF - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -9: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-0x40] - movdqu xmm9, xmmword ptr [r9+rdx-0x40] - movdqu xmm10, xmmword ptr [r10+rdx-0x40] - movdqu xmm11, xmmword ptr [r11+rdx-0x40] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+0x10], xmm9 - movdqa xmmword ptr [rsp+0x20], xmm12 - movdqa xmmword ptr [rsp+0x30], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x30] - movdqu xmm9, xmmword ptr [r9+rdx-0x30] - movdqu xmm10, xmmword ptr [r10+rdx-0x30] - movdqu xmm11, xmmword ptr [r11+rdx-0x30] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x40], xmm8 - movdqa xmmword ptr [rsp+0x50], xmm9 - movdqa xmmword ptr [rsp+0x60], xmm12 - movdqa xmmword ptr [rsp+0x70], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x20] - movdqu xmm9, xmmword ptr [r9+rdx-0x20] - movdqu xmm10, xmmword ptr [r10+rdx-0x20] - 
movdqu xmm11, xmmword ptr [r11+rdx-0x20] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x80], xmm8 - movdqa xmmword ptr [rsp+0x90], xmm9 - movdqa xmmword ptr [rsp+0xA0], xmm12 - movdqa xmmword ptr [rsp+0xB0], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x10] - movdqu xmm9, xmmword ptr [r9+rdx-0x10] - movdqu xmm10, xmmword ptr [r10+rdx-0x10] - movdqu xmm11, xmmword ptr [r11+rdx-0x10] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0xC0], xmm8 - movdqa xmmword ptr [rsp+0xD0], xmm9 - movdqa xmmword ptr [rsp+0xE0], xmm12 - movdqa xmmword ptr [rsp+0xF0], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] - movdqa xmm12, xmmword ptr [rsp+0x110] - movdqa xmm13, xmmword ptr [rsp+0x120] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - movd xmm15, eax - pshufd xmm15, xmm15, 0x00 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, 
xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x80] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 
0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x70] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, 
xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - 
paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 
25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, 
xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xB0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x50] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd 
xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, 
xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - 
movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xC0] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xA0] - paddd xmm3, xmmword ptr 
[rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 
20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld 
xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 
- pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0x60] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0xB1 - pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - movdqa xmm8, 
xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xF0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0xB1 - pshufhw xmm15, xmm15, 0xB1 - pshuflw xmm12, xmm12, 0xB1 - 
pshufhw xmm12, xmm12, 0xB1 - pshuflw xmm13, xmm13, 0xB1 - pshufhw xmm13, xmm13, 0xB1 - pshuflw xmm14, xmm14, 0xB1 - pshufhw xmm14, xmm14, 0xB1 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne 9b - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - 
punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+0x20], xmm1 - movdqu xmmword ptr [rbx+0x40], xmm9 - movdqu xmmword ptr [rbx+0x60], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+0x10], xmm4 - movdqu xmmword ptr [rbx+0x30], xmm5 - movdqu xmmword ptr [rbx+0x50], xmm9 - movdqu xmmword ptr [rbx+0x70], xmm7 - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+0x150] - movdqa xmmword ptr [rsp+0x110], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+0x120] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+0x120], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc 2b - test rsi, rsi - jne 3f -4: - movdqa xmm6, xmmword ptr [rsp+0x170] - movdqa xmm7, xmmword ptr [rsp+0x180] - movdqa xmm8, xmmword ptr [rsp+0x190] - movdqa xmm9, xmmword ptr [rsp+0x1A0] - movdqa xmm10, xmmword ptr [rsp+0x1B0] - movdqa xmm11, xmmword ptr [rsp+0x1C0] - movdqa xmm12, xmmword ptr [rsp+0x1D0] - movdqa xmm13, xmmword ptr [rsp+0x1E0] - movdqa xmm14, xmmword ptr [rsp+0x1F0] - movdqa xmm15, xmmword ptr [rsp+0x200] - mov rsp, rbp - pop rbp - pop rbx - pop rdi - pop rsi - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - test esi, 0x2 - je 3f - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+0x110] - movd xmm14, dword ptr [rsp+0x120] - punpckldq xmm13, xmm14 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr 
[rsp+0x114] - movd xmm13, dword ptr [rsp+0x124] - punpckldq xmm14, xmm13 - movaps xmmword ptr [rsp+0x10], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 0x93 - movups xmm12, xmmword ptr [r9+rdx-0x40] - movups xmm13, xmmword ptr [r9+rdx-0x30] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-0x20] - movups xmm15, xmmword ptr [r9+rdx-0x10] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 0x93 - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 0x93 - shl rax, 0x20 - or rax, 0x40 - movd xmm3, rax - movdqa xmmword ptr [rsp+0x20], xmm3 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+0x10] - punpcklqdq xmm3, xmmword ptr [rsp+0x20] - punpcklqdq xmm11, xmmword ptr [rsp+0x20] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+0x20], xmm4 - movaps xmmword ptr [rsp+0x30], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - pshuflw xmm11, xmm11, 0xB1 - pshufhw xmm11, xmm11, 0xB1 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr 
[rsp+0x40], xmm5 - movaps xmmword ptr [rsp+0x50], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movdqa xmm13, xmm3 - psrld xmm3, 8 - pslld xmm13, 24 - pxor xmm3, xmm13 - movdqa xmm13, xmm11 - psrld xmm11, 8 - pslld xmm13, 24 - pxor xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x93 - pshufd xmm8, xmm8, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x39 - pshufd xmm10, xmm10, 0x39 - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - pshuflw xmm11, xmm11, 0xB1 - pshufhw xmm11, xmm11, 0xB1 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movdqa xmm13, xmm3 - psrld xmm3, 8 - pslld xmm13, 24 - pxor xmm3, xmm13 - movdqa xmm13, xmm11 - psrld xmm11, 8 - pslld xmm13, 24 - pxor xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x39 - pshufd xmm8, xmm8, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x93 - pshufd xmm10, xmm10, 0x93 - dec al - je 9f - movdqa xmm12, xmmword ptr [rsp+0x20] - movdqa xmm5, xmmword ptr [rsp+0x40] - pshufd xmm13, xmm12, 0x0F - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 0x39 - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pand xmm13, xmmword ptr 
[PBLENDW_0x33_MASK+rip] - pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm13, xmm12 - movdqa xmmword ptr [rsp+0x20], xmm13 - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - movdqa xmm13, xmm6 - pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm12, xmm13 - pshufd xmm12, xmm12, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmmword ptr [rsp+0x40], xmm12 - movdqa xmm5, xmmword ptr [rsp+0x30] - movdqa xmm13, xmmword ptr [rsp+0x50] - pshufd xmm6, xmm5, 0x0F - shufps xmm5, xmm13, 214 - pshufd xmm12, xmm5, 0x39 - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm6, xmm5 - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - movdqa xmmword ptr [rsp+0x30], xmm2 - movdqa xmm2, xmm14 - pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm5, xmm2 - movdqa xmm2, xmmword ptr [rsp+0x30] - pshufd xmm5, xmm5, 0x78 - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 0x1E - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+0x20] - movdqa xmm6, xmmword ptr [rsp+0x40] - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - movups xmmword ptr [rbx+0x20], xmm8 - movups xmmword ptr [rbx+0x30], xmm9 - mov eax, dword ptr [rsp+0x130] - neg eax - mov r10d, dword ptr [rsp+0x110+8*rax] - mov r11d, dword ptr [rsp+0x120+8*rax] - mov dword ptr [rsp+0x110], r10d - mov dword ptr [rsp+0x120], r11d - add rdi, 16 - add rbx, 64 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movd xmm13, dword ptr [rsp+0x110] - movd xmm14, dword ptr [rsp+0x120] - punpckldq xmm13, xmm14 - mov r8, qword ptr [rdi] - movzx eax, 
byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - shl rax, 32 - or rax, 64 - movd xmm12, rax - movdqa xmm3, xmm13 - punpcklqdq xmm3, xmm12 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por 
xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.p2align 6 -blake3_compress_in_place_sse2: -_blake3_compress_in_place_sse2: - sub rsp, 120 - movdqa xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+0x10], xmm7 - movdqa xmmword ptr [rsp+0x20], xmm8 - movdqa xmmword ptr [rsp+0x30], xmm9 - movdqa xmmword ptr [rsp+0x40], xmm11 - movdqa xmmword ptr [rsp+0x50], xmm14 - movdqa xmmword ptr [rsp+0x60], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, byte ptr [rsp+0xA0] - movzx r8d, r8b - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+0x20] - movups xmm7, xmmword ptr [rdx+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - 
pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rcx], xmm0 - movups xmmword ptr [rcx+0x10], xmm1 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+0x10] - movdqa xmm8, xmmword ptr [rsp+0x20] - movdqa xmm9, xmmword ptr [rsp+0x30] - movdqa xmm11, xmmword ptr [rsp+0x40] - movdqa xmm14, xmmword ptr [rsp+0x50] - movdqa xmm15, xmmword ptr [rsp+0x60] - add rsp, 120 - ret - - -.p2align 6 -_blake3_compress_xof_sse2: -blake3_compress_xof_sse2: - sub rsp, 120 - movdqa xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+0x10], xmm7 - movdqa xmmword ptr [rsp+0x20], xmm8 - movdqa xmmword ptr [rsp+0x30], xmm9 - movdqa xmmword ptr [rsp+0x40], xmm11 - movdqa xmmword ptr [rsp+0x50], xmm14 - movdqa xmmword ptr [rsp+0x60], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm2, 
xmmword ptr [BLAKE3_IV+rip] - movzx eax, byte ptr [rsp+0xA0] - movzx r8d, r8b - mov r10, qword ptr [rsp+0xA8] - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+0x20] - movups xmm7, xmmword ptr [rdx+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0xB1 - pshufhw xmm3, xmm3, 0xB1 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, 
xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - movdqu xmm4, xmmword ptr [rcx] - movdqu xmm5, xmmword ptr [rcx+0x10] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r10], xmm0 - movups xmmword ptr [r10+0x10], xmm1 - movups xmmword ptr [r10+0x20], xmm2 - movups xmmword ptr [r10+0x30], xmm3 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+0x10] - movdqa xmm8, xmmword ptr [rsp+0x20] - movdqa xmm9, xmmword ptr [rsp+0x30] - movdqa xmm11, xmmword ptr [rsp+0x40] - movdqa xmm14, xmmword ptr [rsp+0x50] - movdqa xmm15, xmmword ptr [rsp+0x60] - add rsp, 120 - ret - - -.section .rodata -.p2align 6 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85 - .long 0x3C6EF372, 0xA54FF53A -ADD0: - .long 0, 1, 2, 3 -ADD1: - .long 4, 4, 4, 4 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 64, 64, 64, 64 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -PBLENDW_0x33_MASK: - .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 -PBLENDW_0xCC_MASK: - .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF -PBLENDW_0x3F_MASK: - .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 -PBLENDW_0xC0_MASK: - .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm deleted file mode 100644 index 72deb7bbc..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm +++ /dev/null @@ -1,2350 +0,0 @@ -public _blake3_hash_many_sse2 -public 
blake3_hash_many_sse2 -public blake3_compress_in_place_sse2 -public _blake3_compress_in_place_sse2 -public blake3_compress_xof_sse2 -public _blake3_compress_xof_sse2 - -_TEXT SEGMENT ALIGN(16) 'CODE' - -ALIGN 16 -blake3_hash_many_sse2 PROC -_blake3_hash_many_sse2 PROC - push r15 - push r14 - push r13 - push r12 - push rsi - push rdi - push rbx - push rbp - mov rbp, rsp - sub rsp, 528 - and rsp, 0FFFFFFFFFFFFFFC0H - movdqa xmmword ptr [rsp+170H], xmm6 - movdqa xmmword ptr [rsp+180H], xmm7 - movdqa xmmword ptr [rsp+190H], xmm8 - movdqa xmmword ptr [rsp+1A0H], xmm9 - movdqa xmmword ptr [rsp+1B0H], xmm10 - movdqa xmmword ptr [rsp+1C0H], xmm11 - movdqa xmmword ptr [rsp+1D0H], xmm12 - movdqa xmmword ptr [rsp+1E0H], xmm13 - movdqa xmmword ptr [rsp+1F0H], xmm14 - movdqa xmmword ptr [rsp+200H], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+68H] - movzx r9, byte ptr [rbp+70H] - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 00H - movdqa xmmword ptr [rsp+130H], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0] - pand xmm0, xmmword ptr [ADD1] - movdqa xmmword ptr [rsp+150H], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 00H - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+110H], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK] - pxor xmm1, xmmword ptr [CMP_MSB_MASK] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 00H - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+120H], xmm2 - mov rbx, qword ptr [rbp+90H] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+78H] - movzx r12d, byte ptr [rbp+88H] - cmp rsi, 4 - jc final3blocks -outerloop4: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 00H - pshufd xmm1, xmm3, 55H - pshufd xmm2, xmm3, 0AAH - pshufd xmm3, xmm3, 0FFH - movdqu xmm7, xmmword ptr [rcx+10H] - pshufd xmm4, xmm7, 00H - pshufd xmm5, xmm7, 55H - pshufd xmm6, xmm7, 0AAH - pshufd xmm7, xmm7, 0FFH - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr 
[rdi+18H] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -innerloop4: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-40H] - movdqu xmm9, xmmword ptr [r9+rdx-40H] - movdqu xmm10, xmmword ptr [r10+rdx-40H] - movdqu xmm11, xmmword ptr [r11+rdx-40H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+10H], xmm9 - movdqa xmmword ptr [rsp+20H], xmm12 - movdqa xmmword ptr [rsp+30H], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-30H] - movdqu xmm9, xmmword ptr [r9+rdx-30H] - movdqu xmm10, xmmword ptr [r10+rdx-30H] - movdqu xmm11, xmmword ptr [r11+rdx-30H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+40H], xmm8 - movdqa xmmword ptr [rsp+50H], xmm9 - movdqa xmmword ptr [rsp+60H], xmm12 - movdqa xmmword ptr [rsp+70H], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-20H] - movdqu xmm9, xmmword ptr [r9+rdx-20H] - movdqu xmm10, xmmword ptr [r10+rdx-20H] - movdqu xmm11, xmmword ptr [r11+rdx-20H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+80H], xmm8 - movdqa xmmword ptr [rsp+90H], xmm9 - movdqa xmmword ptr [rsp+0A0H], xmm12 - movdqa xmmword ptr [rsp+0B0H], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-10H] - 
movdqu xmm9, xmmword ptr [r9+rdx-10H] - movdqu xmm10, xmmword ptr [r10+rdx-10H] - movdqu xmm11, xmmword ptr [r11+rdx-10H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0C0H], xmm8 - movdqa xmmword ptr [rsp+0D0H], xmm9 - movdqa xmmword ptr [rsp+0E0H], xmm12 - movdqa xmmword ptr [rsp+0F0H], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3] - movdqa xmm12, xmmword ptr [rsp+110H] - movdqa xmm13, xmmword ptr [rsp+120H] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] - movd xmm15, eax - pshufd xmm15, xmm15, 00H - prefetcht0 byte ptr [r8+rdx+80H] - prefetcht0 byte ptr [r9+rdx+80H] - prefetcht0 byte ptr [r10+rdx+80H] - prefetcht0 byte ptr [r11+rdx+80H] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+20H] - paddd xmm2, xmmword ptr [rsp+40H] - paddd xmm3, xmmword ptr [rsp+60H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - movdqa xmm8, xmmword ptr [BLAKE3_IV_0] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld 
xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+10H] - paddd xmm1, xmmword ptr [rsp+30H] - paddd xmm2, xmmword ptr [rsp+50H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+80H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp+0C0H] - paddd xmm3, xmmword ptr [rsp+0E0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, 
xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+90H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+0D0H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+20H] - paddd xmm1, xmmword ptr [rsp+30H] - paddd xmm2, xmmword ptr [rsp+70H] - paddd xmm3, xmmword ptr [rsp+40H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor 
xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+60H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0D0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+10H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, xmmword ptr [rsp+90H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - paddd xmm10, 
xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0B0H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp+0E0H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+30H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp+0D0H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - 
pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+40H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, xmmword ptr [rsp+20H] - paddd xmm3, xmmword ptr [rsp+0E0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+60H] - paddd xmm1, xmmword ptr [rsp+90H] - paddd xmm2, xmmword ptr [rsp+0B0H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, 
xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+50H] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0F0H] - paddd xmm3, xmmword ptr [rsp+10H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0A0H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, 
xmmword ptr [rsp+0E0H] - paddd xmm3, xmmword ptr [rsp+0D0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+70H] - paddd xmm1, xmmword ptr [rsp+90H] - paddd xmm2, xmmword ptr [rsp+30H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - 
movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+40H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+50H] - paddd xmm3, xmmword ptr [rsp+10H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+20H] - paddd xmm2, xmmword ptr [rsp+80H] - paddd xmm3, xmmword ptr [rsp+60H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, 
xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0C0H] - paddd xmm1, xmmword ptr [rsp+90H] - paddd xmm2, xmmword ptr [rsp+0F0H] - paddd xmm3, xmmword ptr [rsp+0E0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0D0H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+0A0H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - 
pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+70H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+60H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+20H] - paddd xmm1, xmmword ptr [rsp+30H] - paddd xmm2, xmmword ptr [rsp+10H] - paddd xmm3, xmmword ptr [rsp+40H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, 
xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+90H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+80H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0E0H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp+0C0H] - paddd xmm3, xmmword ptr [rsp+10H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor 
xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0D0H] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+20H] - paddd xmm3, xmmword ptr [rsp+40H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+30H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp+60H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor 
xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0B0H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp+10H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0F0H] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+90H] - paddd xmm3, xmmword ptr 
[rsp+60H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0E0H] - paddd xmm1, xmmword ptr [rsp+20H] - paddd xmm2, xmmword ptr [rsp+30H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - pshuflw xmm15, xmm15, 0B1H - pshufhw xmm15, xmm15, 0B1H - pshuflw xmm12, xmm12, 0B1H - pshufhw xmm12, xmm12, 0B1H - pshuflw xmm13, xmm13, 0B1H - pshufhw xmm13, xmm13, 0B1H - pshuflw xmm14, xmm14, 0B1H - pshufhw xmm14, xmm14, 0B1H - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - 
por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0A0H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, xmmword ptr [rsp+40H] - paddd xmm3, xmmword ptr [rsp+0D0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmm15 - psrld xmm15, 8 - pslld xmm8, 24 - pxor xmm15, xmm8 - movdqa xmm8, xmm12 - psrld xmm12, 8 - pslld xmm8, 24 - pxor xmm12, xmm8 - movdqa xmm8, xmm13 - psrld xmm13, 8 - pslld xmm8, 24 - pxor xmm13, xmm8 - movdqa xmm8, xmm14 - psrld xmm14, 8 - pslld xmm8, 24 - pxor xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne innerloop4 - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+20H], xmm1 - movdqu xmmword ptr [rbx+40H], xmm9 - movdqu xmmword ptr [rbx+60H], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+10H], xmm4 - 
movdqu xmmword ptr [rbx+30H], xmm5 - movdqu xmmword ptr [rbx+50H], xmm9 - movdqu xmmword ptr [rbx+70H], xmm7 - movdqa xmm1, xmmword ptr [rsp+110H] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+150H] - movdqa xmmword ptr [rsp+110H], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK] - pxor xmm1, xmmword ptr [CMP_MSB_MASK] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+120H] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+120H], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc outerloop4 - test rsi, rsi - jne final3blocks -unwind: - movdqa xmm6, xmmword ptr [rsp+170H] - movdqa xmm7, xmmword ptr [rsp+180H] - movdqa xmm8, xmmword ptr [rsp+190H] - movdqa xmm9, xmmword ptr [rsp+1A0H] - movdqa xmm10, xmmword ptr [rsp+1B0H] - movdqa xmm11, xmmword ptr [rsp+1C0H] - movdqa xmm12, xmmword ptr [rsp+1D0H] - movdqa xmm13, xmmword ptr [rsp+1E0H] - movdqa xmm14, xmmword ptr [rsp+1F0H] - movdqa xmm15, xmmword ptr [rsp+200H] - mov rsp, rbp - pop rbp - pop rbx - pop rdi - pop rsi - pop r12 - pop r13 - pop r14 - pop r15 - ret -ALIGN 16 -final3blocks: - test esi, 2H - je final1block - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+10H] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+110H] - movd xmm14, dword ptr [rsp+120H] - punpckldq xmm13, xmm14 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr [rsp+114H] - movd xmm13, dword ptr [rsp+124H] - punpckldq xmm14, xmm13 - movaps xmmword ptr [rsp+10H], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -innerloop2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-40H] - movups xmm5, xmmword ptr [r8+rdx-30H] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-20H] - movups xmm7, xmmword ptr [r8+rdx-10H] - movaps 
xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 93H - movups xmm12, xmmword ptr [r9+rdx-40H] - movups xmm13, xmmword ptr [r9+rdx-30H] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-20H] - movups xmm15, xmmword ptr [r9+rdx-10H] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 93H - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 93H - shl rax, 20H - or rax, 40H - movd xmm3, rax - movdqa xmmword ptr [rsp+20H], xmm3 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+10H] - punpcklqdq xmm3, xmmword ptr [rsp+20H] - punpcklqdq xmm11, xmmword ptr [rsp+20H] - mov al, 7 -roundloop2: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+20H], xmm4 - movaps xmmword ptr [rsp+30H], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - pshuflw xmm11, xmm11, 0B1H - pshufhw xmm11, xmm11, 0B1H - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+40H], xmm5 - movaps xmmword ptr [rsp+50H], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movdqa xmm13, xmm3 - psrld xmm3, 8 - pslld xmm13, 24 - pxor xmm3, xmm13 - movdqa xmm13, xmm11 - psrld xmm11, 8 - pslld xmm13, 24 - pxor xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 93H - pshufd xmm8, xmm8, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm11, xmm11, 4EH - pshufd xmm2, xmm2, 39H - pshufd xmm10, xmm10, 39H - 
paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - pshuflw xmm11, xmm11, 0B1H - pshufhw xmm11, xmm11, 0B1H - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movdqa xmm13, xmm3 - psrld xmm3, 8 - pslld xmm13, 24 - pxor xmm3, xmm13 - movdqa xmm13, xmm11 - psrld xmm11, 8 - pslld xmm13, 24 - pxor xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 39H - pshufd xmm8, xmm8, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm11, xmm11, 4EH - pshufd xmm2, xmm2, 93H - pshufd xmm10, xmm10, 93H - dec al - je endroundloop2 - movdqa xmm12, xmmword ptr [rsp+20H] - movdqa xmm5, xmmword ptr [rsp+40H] - pshufd xmm13, xmm12, 0FH - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 39H - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] - pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] - por xmm13, xmm12 - movdqa xmmword ptr [rsp+20H], xmm13 - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - movdqa xmm13, xmm6 - pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] - pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] - por xmm12, xmm13 - pshufd xmm12, xmm12, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmmword ptr [rsp+40H], xmm12 - movdqa xmm5, xmmword ptr [rsp+30H] - movdqa xmm13, xmmword ptr [rsp+50H] - pshufd xmm6, xmm5, 0FH - shufps xmm5, xmm13, 214 - pshufd xmm12, xmm5, 39H - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pand xmm6, xmmword ptr 
[PBLENDW_0x33_MASK] - pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] - por xmm6, xmm5 - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - movdqa xmmword ptr [rsp+30H], xmm2 - movdqa xmm2, xmm14 - pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] - pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] - por xmm5, xmm2 - movdqa xmm2, xmmword ptr [rsp+30H] - pshufd xmm5, xmm5, 78H - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 1EH - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+20H] - movdqa xmm6, xmmword ptr [rsp+40H] - jmp roundloop2 -endroundloop2: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne innerloop2 - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+10H], xmm1 - movups xmmword ptr [rbx+20H], xmm8 - movups xmmword ptr [rbx+30H], xmm9 - mov eax, dword ptr [rsp+130H] - neg eax - mov r10d, dword ptr [rsp+110H+8*rax] - mov r11d, dword ptr [rsp+120H+8*rax] - mov dword ptr [rsp+110H], r10d - mov dword ptr [rsp+120H], r11d - add rdi, 16 - add rbx, 64 - sub rsi, 2 -final1block: - test esi, 1H - je unwind - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+10H] - movd xmm13, dword ptr [rsp+110H] - movd xmm14, dword ptr [rsp+120H] - punpckldq xmm13, xmm14 - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -innerloop1: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV] - shl rax, 32 - or rax, 64 - movd xmm12, rax - movdqa xmm3, xmm13 - punpcklqdq xmm3, xmm12 - movups xmm4, xmmword ptr [r8+rdx-40H] - movups xmm5, xmmword ptr [r8+rdx-30H] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-20H] - movups xmm7, xmmword ptr [r8+rdx-10H] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 93H - mov al, 7 -roundloop1: - paddd xmm0, 
xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 39H - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 93H - dec al - jz endroundloop1 - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0FH - pshufd xmm4, xmm8, 39H - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - por xmm8, xmm10 - pshufd xmm8, xmm8, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp roundloop1 -endroundloop1: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne innerloop1 - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+10H], xmm1 - jmp unwind -_blake3_hash_many_sse2 ENDP -blake3_hash_many_sse2 ENDP - -blake3_compress_in_place_sse2 PROC -_blake3_compress_in_place_sse2 PROC - sub rsp, 120 - movdqa 
xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+10H], xmm7 - movdqa xmmword ptr [rsp+20H], xmm8 - movdqa xmmword ptr [rsp+30H], xmm9 - movdqa xmmword ptr [rsp+40H], xmm11 - movdqa xmmword ptr [rsp+50H], xmm14 - movdqa xmmword ptr [rsp+60H], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+10H] - movaps xmm2, xmmword ptr [BLAKE3_IV] - movzx eax, byte ptr [rsp+0A0H] - movzx r8d, r8b - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+10H] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+20H] - movups xmm7, xmmword ptr [rdx+30H] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 93H - mov al, 7 -@@: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 39H - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 93H - dec al - jz @F - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - 
pshufd xmm9, xmm4, 0FH - pshufd xmm4, xmm8, 39H - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - por xmm8, xmm10 - pshufd xmm8, xmm8, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp @B -@@: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rcx], xmm0 - movups xmmword ptr [rcx+10H], xmm1 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+10H] - movdqa xmm8, xmmword ptr [rsp+20H] - movdqa xmm9, xmmword ptr [rsp+30H] - movdqa xmm11, xmmword ptr [rsp+40H] - movdqa xmm14, xmmword ptr [rsp+50H] - movdqa xmm15, xmmword ptr [rsp+60H] - add rsp, 120 - ret -_blake3_compress_in_place_sse2 ENDP -blake3_compress_in_place_sse2 ENDP - -ALIGN 16 -blake3_compress_xof_sse2 PROC -_blake3_compress_xof_sse2 PROC - sub rsp, 120 - movdqa xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+10H], xmm7 - movdqa xmmword ptr [rsp+20H], xmm8 - movdqa xmmword ptr [rsp+30H], xmm9 - movdqa xmmword ptr [rsp+40H], xmm11 - movdqa xmmword ptr [rsp+50H], xmm14 - movdqa xmmword ptr [rsp+60H], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+10H] - movaps xmm2, xmmword ptr [BLAKE3_IV] - movzx eax, byte ptr [rsp+0A0H] - movzx r8d, r8b - mov r10, qword ptr [rsp+0A8H] - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+10H] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+20H] - movups xmm7, xmmword ptr [rdx+30H] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 93H - mov al, 7 -@@: - paddd xmm0, xmm4 - 
paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 39H - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshuflw xmm3, xmm3, 0B1H - pshufhw xmm3, xmm3, 0B1H - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - movdqa xmm14, xmm3 - psrld xmm3, 8 - pslld xmm14, 24 - pxor xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 93H - dec al - jz @F - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0FH - pshufd xmm4, xmm8, 39H - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] - pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] - por xmm9, xmm8 - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 - pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - por xmm8, xmm10 - pshufd xmm8, xmm8, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp @B -@@: - movdqu xmm4, xmmword ptr [rcx] - movdqu xmm5, xmmword ptr [rcx+10H] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r10], xmm0 - movups xmmword ptr [r10+10H], xmm1 - movups xmmword ptr [r10+20H], xmm2 - movups xmmword ptr [r10+30H], xmm3 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+10H] - 
movdqa xmm8, xmmword ptr [rsp+20H] - movdqa xmm9, xmmword ptr [rsp+30H] - movdqa xmm11, xmmword ptr [rsp+40H] - movdqa xmm14, xmmword ptr [rsp+50H] - movdqa xmm15, xmmword ptr [rsp+60H] - add rsp, 120 - ret -_blake3_compress_xof_sse2 ENDP -blake3_compress_xof_sse2 ENDP - -_TEXT ENDS - - -_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' -ALIGN 64 -BLAKE3_IV: - dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH - -ADD0: - dd 0, 1, 2, 3 - -ADD1: - dd 4 dup (4) - -BLAKE3_IV_0: - dd 4 dup (6A09E667H) - -BLAKE3_IV_1: - dd 4 dup (0BB67AE85H) - -BLAKE3_IV_2: - dd 4 dup (3C6EF372H) - -BLAKE3_IV_3: - dd 4 dup (0A54FF53AH) - -BLAKE3_BLOCK_LEN: - dd 4 dup (64) - -CMP_MSB_MASK: - dd 8 dup(80000000H) - -PBLENDW_0x33_MASK: - dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H -PBLENDW_0xCC_MASK: - dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH -PBLENDW_0x3F_MASK: - dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H -PBLENDW_0xC0_MASK: - dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH - -_RDATA ENDS -END diff --git a/thirdparty/BLAKE3/c/blake3_sse41.c b/thirdparty/BLAKE3/c/blake3_sse41.c deleted file mode 100644 index b31122533..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse41.c +++ /dev/null @@ -1,559 +0,0 @@ -#include "blake3_impl.h" - -#include <immintrin.h> - -#define DEGREE 4 - -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) - -INLINE __m128i loadu(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); -} - -INLINE void storeu(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); -} - -INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } - -// Note that clang-format doesn't like the name "xor" for some reason. 
-INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } - -INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } - -INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); -} - -INLINE __m128i rot16(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); -} - -INLINE __m128i rot12(__m128i x) { - return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); -} - -INLINE __m128i rot8(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); -} - -INLINE __m128i rot7(__m128i x) { - return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); -} - -INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot16(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot12(*row1); -} - -INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot8(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot7(*row1); -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); -} - -INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); -} - -INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu((uint8_t *)&cv[0]); - rows[1] = loadu((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - 
undiagonalize(&rows[0], &rows[2], &rows[3]); -} - -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); -} - -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), &out[0]); - storeu(xorv(rows[1], rows[3]), &out[16]); - storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); - storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); -} - -INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - 
v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], 
v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -INLINE void transpose_vecs(__m128i vecs[DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = 
loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[4]); - transpose_vecs(&out[8]); - transpose_vecs(&out[12]); -} - -INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); - const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); - const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); - __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), - _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); - *out_lo = l; - *out_hi = h; -} - -void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - 
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, - }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&h_vecs[0]); - transpose_vecs(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); -} - -INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); -} - -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, 
uint8_t *out) { - while (num_inputs >= DEGREE) { - blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; - } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; - } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S deleted file mode 100644 index a3ff64269..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S +++ /dev/null @@ -1,2028 +0,0 @@ -#if defined(__ELF__) && defined(__linux__) -.section .note.GNU-stack,"",%progbits -#endif - -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - -.intel_syntax noprefix -.global blake3_hash_many_sse41 -.global _blake3_hash_many_sse41 -.global blake3_compress_in_place_sse41 -.global _blake3_compress_in_place_sse41 -.global blake3_compress_xof_sse41 -.global _blake3_compress_xof_sse41 -#ifdef __APPLE__ -.text -#else -.section .text -#endif - .p2align 6 -_blake3_hash_many_sse41: -blake3_hash_many_sse41: - _CET_ENDBR - push r15 - push r14 - push r13 - push r12 - push rbx - push rbp - mov rbp, rsp - sub rsp, 360 - and rsp, 0xFFFFFFFFFFFFFFC0 - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 0x00 - movdqa xmmword ptr [rsp+0x130], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0+rip] - pand xmm0, xmmword ptr [ADD1+rip] - movdqa xmmword ptr [rsp+0x150], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 0x00 - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+0x110], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm1, xmm0 - shr 
r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 0x00 - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - mov rbx, qword ptr [rbp+0x50] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+0x38] - movzx r12d, byte ptr [rbp+0x48] - cmp rsi, 4 - jc 3f -2: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 0x00 - pshufd xmm1, xmm3, 0x55 - pshufd xmm2, xmm3, 0xAA - pshufd xmm3, xmm3, 0xFF - movdqu xmm7, xmmword ptr [rcx+0x10] - pshufd xmm4, xmm7, 0x00 - pshufd xmm5, xmm7, 0x55 - pshufd xmm6, xmm7, 0xAA - pshufd xmm7, xmm7, 0xFF - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -9: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-0x40] - movdqu xmm9, xmmword ptr [r9+rdx-0x40] - movdqu xmm10, xmmword ptr [r10+rdx-0x40] - movdqu xmm11, xmmword ptr [r11+rdx-0x40] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+0x10], xmm9 - movdqa xmmword ptr [rsp+0x20], xmm12 - movdqa xmmword ptr [rsp+0x30], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x30] - movdqu xmm9, xmmword ptr [r9+rdx-0x30] - movdqu xmm10, xmmword ptr [r10+rdx-0x30] - movdqu xmm11, xmmword ptr [r11+rdx-0x30] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x40], xmm8 - movdqa xmmword ptr [rsp+0x50], xmm9 - movdqa xmmword ptr [rsp+0x60], xmm12 - movdqa xmmword ptr 
[rsp+0x70], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x20] - movdqu xmm9, xmmword ptr [r9+rdx-0x20] - movdqu xmm10, xmmword ptr [r10+rdx-0x20] - movdqu xmm11, xmmword ptr [r11+rdx-0x20] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x80], xmm8 - movdqa xmmword ptr [rsp+0x90], xmm9 - movdqa xmmword ptr [rsp+0xA0], xmm12 - movdqa xmmword ptr [rsp+0xB0], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x10] - movdqu xmm9, xmmword ptr [r9+rdx-0x10] - movdqu xmm10, xmmword ptr [r10+rdx-0x10] - movdqu xmm11, xmmword ptr [r11+rdx-0x10] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0xC0], xmm8 - movdqa xmmword ptr [rsp+0xD0], xmm9 - movdqa xmmword ptr [rsp+0xE0], xmm12 - movdqa xmmword ptr [rsp+0xF0], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] - movdqa xmm12, xmmword ptr [rsp+0x110] - movdqa xmm13, xmmword ptr [rsp+0x120] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - movd xmm15, eax - pshufd xmm15, xmm15, 0x00 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb 
xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x80] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 
- movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x70] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor 
xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - 
por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 
- pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xB0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, 
xmmword ptr [rsp+0x50] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr 
[rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - 
pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xC0] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xA0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - 
pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - 
paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa 
xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0x60] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld 
xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xF0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, 
xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, 
xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne 9b - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+0x20], xmm1 - movdqu xmmword ptr [rbx+0x40], xmm9 - movdqu xmmword ptr [rbx+0x60], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+0x10], xmm4 - movdqu xmmword ptr [rbx+0x30], xmm5 - movdqu xmmword ptr [rbx+0x50], xmm9 - movdqu xmmword ptr [rbx+0x70], xmm7 - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+0x150] - movdqa xmmword ptr [rsp+0x110], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+0x120] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+0x120], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc 2b - test rsi, rsi - jnz 3f -4: - mov rsp, rbp - pop rbp - pop rbx - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - test esi, 0x2 - je 3f - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr [rsp+0x114] - pinsrd xmm14, dword ptr [rsp+0x124], 1 - pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp+0x10], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x40] 
- or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 0x93 - movups xmm12, xmmword ptr [r9+rdx-0x40] - movups xmm13, xmmword ptr [r9+rdx-0x30] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-0x20] - movups xmm15, xmmword ptr [r9+rdx-0x10] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 0x93 - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 0x93 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+0x10] - pinsrd xmm3, eax, 3 - pinsrd xmm11, eax, 3 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+0x20], xmm4 - movaps xmmword ptr [rsp+0x30], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16+rip] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+0x40], xmm5 - movaps xmmword ptr [rsp+0x50], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8+rip] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - 
movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x93 - pshufd xmm8, xmm8, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x39 - pshufd xmm10, xmm10, 0x39 - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x39 - pshufd xmm8, xmm8, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x93 - pshufd xmm10, xmm10, 0x93 - dec al - je 9f - movdqa xmm12, xmmword ptr [rsp+0x20] - movdqa xmm5, xmmword ptr [rsp+0x40] - pshufd xmm13, xmm12, 0x0F - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 0x39 - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0xCC - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0xC0 - pshufd xmm12, xmm12, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmmword ptr [rsp+0x20], xmm13 - movdqa xmmword ptr [rsp+0x40], xmm12 - movdqa xmm5, xmmword ptr [rsp+0x30] - movdqa xmm13, xmmword ptr [rsp+0x50] - pshufd xmm6, xmm5, 0x0F - shufps xmm5, xmm13, 214 - pshufd xmm12, xmm5, 0x39 - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0xCC - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0xC0 - pshufd xmm5, xmm5, 0x78 - punpckhdq xmm13, xmm15 - 
punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 0x1E - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+0x20] - movdqa xmm6, xmmword ptr [rsp+0x40] - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - movups xmmword ptr [rbx+0x20], xmm8 - movups xmmword ptr [rbx+0x30], xmm9 - movdqa xmm0, xmmword ptr [rsp+0x130] - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm2, xmmword ptr [rsp+0x120] - movdqu xmm3, xmmword ptr [rsp+0x118] - movdqu xmm4, xmmword ptr [rsp+0x128] - blendvps xmm1, xmm3, xmm0 - blendvps xmm2, xmm4, xmm0 - movdqa xmmword ptr [rsp+0x110], xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - add rdi, 16 - add rbx, 64 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x40] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm3, xmm13 - pinsrd xmm3, eax, 3 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - 
pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.p2align 6 -blake3_compress_in_place_sse41: -_blake3_compress_in_place_sse41: - _CET_ENDBR - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - shl r8, 32 - add rdx, r8 - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd 
xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rdi], xmm0 - movups xmmword ptr [rdi+0x10], xmm1 - ret - -.p2align 6 -blake3_compress_xof_sse41: -_blake3_compress_xof_sse41: - _CET_ENDBR - movups xmm0, xmmword ptr [rdi] - movups xmm1, xmmword ptr [rdi+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, r8b - movzx edx, dl - shl rax, 32 - add rdx, rax - movq xmm3, rcx - movq xmm4, rdx - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rsi] - movups xmm5, xmmword ptr [rsi+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rsi+0x20] - movups xmm7, xmmword ptr [rsi+0x30] - movaps xmm8, xmm6 - 
shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - movdqu xmm4, xmmword ptr [rdi] - movdqu xmm5, xmmword ptr [rdi+0x10] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r9], xmm0 - movups xmmword ptr [r9+0x10], xmm1 - movups xmmword ptr [r9+0x20], xmm2 - movups xmmword ptr [r9+0x30], xmm3 - ret - - -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif -.p2align 6 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85 - .long 0x3C6EF372, 0xA54FF53A -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 
11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -ADD0: - .long 0, 1, 2, 3 -ADD1: - .long 4, 4, 4, 4 -BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 64, 64, 64, 64 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S deleted file mode 100644 index 60d0a4042..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S +++ /dev/null @@ -1,2069 +0,0 @@ -.intel_syntax noprefix -.global blake3_hash_many_sse41 -.global _blake3_hash_many_sse41 -.global blake3_compress_in_place_sse41 -.global _blake3_compress_in_place_sse41 -.global blake3_compress_xof_sse41 -.global _blake3_compress_xof_sse41 -.section .text - .p2align 6 -_blake3_hash_many_sse41: -blake3_hash_many_sse41: - push r15 - push r14 - push r13 - push r12 - push rsi - push rdi - push rbx - push rbp - mov rbp, rsp - sub rsp, 528 - and rsp, 0xFFFFFFFFFFFFFFC0 - movdqa xmmword ptr [rsp+0x170], xmm6 - movdqa xmmword ptr [rsp+0x180], xmm7 - movdqa xmmword ptr [rsp+0x190], xmm8 - movdqa xmmword ptr [rsp+0x1A0], xmm9 - movdqa xmmword ptr [rsp+0x1B0], xmm10 - movdqa xmmword ptr [rsp+0x1C0], xmm11 - movdqa xmmword ptr [rsp+0x1D0], xmm12 - movdqa xmmword ptr [rsp+0x1E0], xmm13 - movdqa xmmword ptr [rsp+0x1F0], xmm14 - movdqa xmmword ptr [rsp+0x200], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+0x68] - movzx r9, byte ptr [rbp+0x70] - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 0x00 - movdqa xmmword ptr [rsp+0x130], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0+rip] - pand xmm0, xmmword ptr [ADD1+rip] - movdqa xmmword ptr 
[rsp+0x150], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 0x00 - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+0x110], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 0x00 - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - mov rbx, qword ptr [rbp+0x90] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+0x78] - movzx r12d, byte ptr [rbp+0x88] - cmp rsi, 4 - jc 3f -2: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 0x00 - pshufd xmm1, xmm3, 0x55 - pshufd xmm2, xmm3, 0xAA - pshufd xmm3, xmm3, 0xFF - movdqu xmm7, xmmword ptr [rcx+0x10] - pshufd xmm4, xmm7, 0x00 - pshufd xmm5, xmm7, 0x55 - pshufd xmm6, xmm7, 0xAA - pshufd xmm7, xmm7, 0xFF - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - mov r10, qword ptr [rdi+0x10] - mov r11, qword ptr [rdi+0x18] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -9: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-0x40] - movdqu xmm9, xmmword ptr [r9+rdx-0x40] - movdqu xmm10, xmmword ptr [r10+rdx-0x40] - movdqu xmm11, xmmword ptr [r11+rdx-0x40] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+0x10], xmm9 - movdqa xmmword ptr [rsp+0x20], xmm12 - movdqa xmmword ptr [rsp+0x30], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x30] - movdqu xmm9, xmmword ptr [r9+rdx-0x30] - movdqu xmm10, xmmword ptr [r10+rdx-0x30] - movdqu xmm11, xmmword ptr [r11+rdx-0x30] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - 
punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x40], xmm8 - movdqa xmmword ptr [rsp+0x50], xmm9 - movdqa xmmword ptr [rsp+0x60], xmm12 - movdqa xmmword ptr [rsp+0x70], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x20] - movdqu xmm9, xmmword ptr [r9+rdx-0x20] - movdqu xmm10, xmmword ptr [r10+rdx-0x20] - movdqu xmm11, xmmword ptr [r11+rdx-0x20] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0x80], xmm8 - movdqa xmmword ptr [rsp+0x90], xmm9 - movdqa xmmword ptr [rsp+0xA0], xmm12 - movdqa xmmword ptr [rsp+0xB0], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-0x10] - movdqu xmm9, xmmword ptr [r9+rdx-0x10] - movdqu xmm10, xmmword ptr [r10+rdx-0x10] - movdqu xmm11, xmmword ptr [r11+rdx-0x10] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0xC0], xmm8 - movdqa xmmword ptr [rsp+0xD0], xmm9 - movdqa xmmword ptr [rsp+0xE0], xmm12 - movdqa xmmword ptr [rsp+0xF0], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] - movdqa xmm12, xmmword ptr [rsp+0x110] - movdqa xmm13, xmmword ptr [rsp+0x120] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] - movd xmm15, eax - pshufd xmm15, xmm15, 0x00 - prefetcht0 [r8+rdx+0x80] - prefetcht0 [r9+rdx+0x80] - prefetcht0 [r10+rdx+0x80] - prefetcht0 [r11+rdx+0x80] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x40] - 
paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x80] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - 
pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x70] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb 
xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x10] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa 
xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0xD0] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, 
xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x60] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xB0] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - 
por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x50] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0xE0] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - 
pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x40] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x50] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, 
xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xC0] - paddd xmm1, xmmword ptr [rsp+0x90] - paddd xmm2, xmmword ptr [rsp+0xF0] - paddd xmm3, xmmword ptr [rsp+0xE0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0xA0] - paddd xmm3, xmmword ptr 
[rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0x70] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x20] - paddd xmm1, xmmword ptr [rsp+0x30] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - 
pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x90] - paddd xmm1, xmmword ptr [rsp+0xB0] - paddd xmm2, xmmword ptr [rsp+0x80] - paddd xmm3, xmmword ptr [rsp+0xF0] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0xC0] - paddd xmm3, xmmword ptr [rsp+0x10] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - 
pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xD0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x20] - paddd xmm3, xmmword ptr [rsp+0x40] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0x30] - paddd xmm1, xmmword ptr [rsp+0xA0] - paddd xmm2, xmmword ptr [rsp+0x60] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - 
paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xB0] - paddd xmm1, xmmword ptr [rsp+0x50] - paddd xmm2, xmmword ptr [rsp+0x10] - paddd xmm3, xmmword ptr [rsp+0x80] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xF0] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0x90] - paddd xmm3, xmmword ptr [rsp+0x60] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa 
xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0xE0] - paddd xmm1, xmmword ptr [rsp+0x20] - paddd xmm2, xmmword ptr [rsp+0x30] - paddd xmm3, xmmword ptr [rsp+0x70] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+0x100], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0xA0] - paddd xmm1, xmmword ptr [rsp+0xC0] - paddd xmm2, xmmword ptr [rsp+0x40] - paddd xmm3, xmmword ptr [rsp+0xD0] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+0x100] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - 
por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne 9b - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+0x20], xmm1 - movdqu xmmword ptr [rbx+0x40], xmm9 - movdqu xmmword ptr [rbx+0x60], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+0x10], xmm4 - movdqu xmmword ptr [rbx+0x30], xmm5 - movdqu xmmword ptr [rbx+0x50], xmm9 - movdqu xmmword ptr [rbx+0x70], xmm7 - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+0x150] - movdqa xmmword ptr [rsp+0x110], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] - pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+0x120] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+0x120], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc 2b - test rsi, rsi - jne 3f -4: - movdqa xmm6, xmmword ptr [rsp+0x170] - movdqa xmm7, xmmword ptr [rsp+0x180] - movdqa xmm8, xmmword ptr [rsp+0x190] - movdqa xmm9, xmmword ptr [rsp+0x1A0] - movdqa xmm10, xmmword ptr [rsp+0x1B0] - movdqa xmm11, xmmword ptr [rsp+0x1C0] - movdqa xmm12, xmmword ptr [rsp+0x1D0] - movdqa xmm13, xmmword ptr [rsp+0x1E0] - movdqa xmm14, xmmword ptr [rsp+0x1F0] - movdqa xmm15, xmmword ptr [rsp+0x200] 
- mov rsp, rbp - pop rbp - pop rbx - pop rdi - pop rsi - pop r12 - pop r13 - pop r14 - pop r15 - ret -.p2align 5 -3: - test esi, 0x2 - je 3f - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp], xmm13 - movd xmm14, dword ptr [rsp+0x114] - pinsrd xmm14, dword ptr [rsp+0x124], 1 - pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmmword ptr [rsp+0x10], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+0x8] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 0x93 - movups xmm12, xmmword ptr [r9+rdx-0x40] - movups xmm13, xmmword ptr [r9+rdx-0x30] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-0x20] - movups xmm15, xmmword ptr [r9+rdx-0x10] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 0x93 - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 0x93 - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+0x10] - pinsrd xmm3, eax, 3 - pinsrd xmm11, eax, 3 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+0x20], xmm4 - movaps xmmword ptr [rsp+0x30], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16+rip] - pshufb 
xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+0x40], xmm5 - movaps xmmword ptr [rsp+0x50], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8+rip] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x93 - pshufd xmm8, xmm8, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x39 - pshufd xmm10, xmm10, 0x39 - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 0x39 - pshufd xmm8, xmm8, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm11, xmm11, 0x4E - pshufd xmm2, xmm2, 0x93 - pshufd xmm10, xmm10, 0x93 - dec al - je 9f - movdqa xmm12, xmmword ptr [rsp+0x20] - movdqa xmm5, xmmword ptr [rsp+0x40] - pshufd xmm13, xmm12, 0x0F - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 0x39 - movdqa 
xmm12, xmm6 - shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0xCC - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0xC0 - pshufd xmm12, xmm12, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmmword ptr [rsp+0x20], xmm13 - movdqa xmmword ptr [rsp+0x40], xmm12 - movdqa xmm5, xmmword ptr [rsp+0x30] - movdqa xmm13, xmmword ptr [rsp+0x50] - pshufd xmm6, xmm5, 0x0F - shufps xmm5, xmm13, 214 - pshufd xmm12, xmm5, 0x39 - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0xCC - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0xC0 - pshufd xmm5, xmm5, 0x78 - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 0x1E - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+0x20] - movdqa xmm6, xmmword ptr [rsp+0x40] - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - movups xmmword ptr [rbx+0x20], xmm8 - movups xmmword ptr [rbx+0x30], xmm9 - movdqa xmm0, xmmword ptr [rsp+0x130] - movdqa xmm1, xmmword ptr [rsp+0x110] - movdqa xmm2, xmmword ptr [rsp+0x120] - movdqu xmm3, xmmword ptr [rsp+0x118] - movdqu xmm4, xmmword ptr [rsp+0x128] - blendvps xmm1, xmm3, xmm0 - blendvps xmm2, xmm4, xmm0 - movdqa xmmword ptr [rsp+0x110], xmm1 - movdqa xmmword ptr [rsp+0x120], xmm2 - add rdi, 16 - add rbx, 64 - sub rsi, 2 -3: - test esi, 0x1 - je 4b - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movd xmm13, dword ptr [rsp+0x110] - pinsrd xmm13, dword ptr [rsp+0x120], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+0x80] - or eax, r13d - xor edx, edx -2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr 
[BLAKE3_IV+rip] - movaps xmm3, xmm13 - pinsrd xmm3, eax, 3 - movups xmm4, xmmword ptr [r8+rdx-0x40] - movups xmm5, xmmword ptr [r8+rdx-0x30] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-0x20] - movups xmm7, xmmword ptr [r8+rdx-0x10] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne 2b - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+0x10], xmm1 - jmp 4b - -.p2align 6 -blake3_compress_in_place_sse41: 
-_blake3_compress_in_place_sse41: - sub rsp, 120 - movdqa xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+0x10], xmm7 - movdqa xmmword ptr [rsp+0x20], xmm8 - movdqa xmmword ptr [rsp+0x30], xmm9 - movdqa xmmword ptr [rsp+0x40], xmm11 - movdqa xmmword ptr [rsp+0x50], xmm14 - movdqa xmmword ptr [rsp+0x60], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, byte ptr [rsp+0xA0] - movzx r8d, r8b - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+0x20] - movups xmm7, xmmword ptr [rdx+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, 
xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rcx], xmm0 - movups xmmword ptr [rcx+0x10], xmm1 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+0x10] - movdqa xmm8, xmmword ptr [rsp+0x20] - movdqa xmm9, xmmword ptr [rsp+0x30] - movdqa xmm11, xmmword ptr [rsp+0x40] - movdqa xmm14, xmmword ptr [rsp+0x50] - movdqa xmm15, xmmword ptr [rsp+0x60] - add rsp, 120 - ret - - -.p2align 6 -_blake3_compress_xof_sse41: -blake3_compress_xof_sse41: - sub rsp, 120 - movdqa xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+0x10], xmm7 - movdqa xmmword ptr [rsp+0x20], xmm8 - movdqa xmmword ptr [rsp+0x30], xmm9 - movdqa xmmword ptr [rsp+0x40], xmm11 - movdqa xmmword ptr [rsp+0x50], xmm14 - movdqa xmmword ptr [rsp+0x60], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+0x10] - movaps xmm2, xmmword ptr [BLAKE3_IV+rip] - movzx eax, byte ptr [rsp+0xA0] - movzx r8d, r8b - mov r10, qword ptr [rsp+0xA8] - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+0x10] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+0x20] - movups xmm7, xmmword ptr [rdx+0x30] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 0x93 - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] - mov al, 7 -9: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - 
paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x93 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x39 - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 0x39 - pshufd xmm3, xmm3, 0x4E - pshufd xmm2, xmm2, 0x93 - dec al - jz 9f - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0x0F - pshufd xmm4, xmm8, 0x39 - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 - pshufd xmm8, xmm8, 0x78 - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 0x1E - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp 9b -9: - movdqu xmm4, xmmword ptr [rcx] - movdqu xmm5, xmmword ptr [rcx+0x10] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r10], xmm0 - movups xmmword ptr [r10+0x10], xmm1 - movups xmmword ptr [r10+0x20], xmm2 - movups xmmword ptr [r10+0x30], xmm3 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+0x10] - movdqa xmm8, xmmword ptr [rsp+0x20] - movdqa xmm9, xmmword ptr [rsp+0x30] - movdqa xmm11, xmmword ptr [rsp+0x40] - movdqa xmm14, xmmword ptr [rsp+0x50] - movdqa xmm15, xmmword ptr [rsp+0x60] - add rsp, 120 - ret - - -.section .rodata -.p2align 6 -BLAKE3_IV: - .long 0x6A09E667, 0xBB67AE85 - .long 0x3C6EF372, 0xA54FF53A -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -ADD0: - .long 0, 1, 2, 3 -ADD1: - .long 4, 4, 4, 4 
-BLAKE3_IV_0: - .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 -BLAKE3_IV_1: - .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 -BLAKE3_IV_2: - .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 -BLAKE3_IV_3: - .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A -BLAKE3_BLOCK_LEN: - .long 64, 64, 64, 64 -CMP_MSB_MASK: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm deleted file mode 100644 index 87001e4d3..000000000 --- a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm +++ /dev/null @@ -1,2089 +0,0 @@ -public _blake3_hash_many_sse41 -public blake3_hash_many_sse41 -public blake3_compress_in_place_sse41 -public _blake3_compress_in_place_sse41 -public blake3_compress_xof_sse41 -public _blake3_compress_xof_sse41 - -_TEXT SEGMENT ALIGN(16) 'CODE' - -ALIGN 16 -blake3_hash_many_sse41 PROC -_blake3_hash_many_sse41 PROC - push r15 - push r14 - push r13 - push r12 - push rsi - push rdi - push rbx - push rbp - mov rbp, rsp - sub rsp, 528 - and rsp, 0FFFFFFFFFFFFFFC0H - movdqa xmmword ptr [rsp+170H], xmm6 - movdqa xmmword ptr [rsp+180H], xmm7 - movdqa xmmword ptr [rsp+190H], xmm8 - movdqa xmmword ptr [rsp+1A0H], xmm9 - movdqa xmmword ptr [rsp+1B0H], xmm10 - movdqa xmmword ptr [rsp+1C0H], xmm11 - movdqa xmmword ptr [rsp+1D0H], xmm12 - movdqa xmmword ptr [rsp+1E0H], xmm13 - movdqa xmmword ptr [rsp+1F0H], xmm14 - movdqa xmmword ptr [rsp+200H], xmm15 - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, qword ptr [rbp+68H] - movzx r9, byte ptr [rbp+70H] - neg r9d - movd xmm0, r9d - pshufd xmm0, xmm0, 00H - movdqa xmmword ptr [rsp+130H], xmm0 - movdqa xmm1, xmm0 - pand xmm1, xmmword ptr [ADD0] - pand xmm0, xmmword ptr [ADD1] - movdqa xmmword ptr [rsp+150H], xmm0 - movd xmm0, r8d - pshufd xmm0, xmm0, 00H - paddd xmm0, xmm1 - movdqa xmmword ptr [rsp+110H], xmm0 - pxor xmm0, xmmword ptr [CMP_MSB_MASK] - pxor xmm1, 
xmmword ptr [CMP_MSB_MASK] - pcmpgtd xmm1, xmm0 - shr r8, 32 - movd xmm2, r8d - pshufd xmm2, xmm2, 00H - psubd xmm2, xmm1 - movdqa xmmword ptr [rsp+120H], xmm2 - mov rbx, qword ptr [rbp+90H] - mov r15, rdx - shl r15, 6 - movzx r13d, byte ptr [rbp+78H] - movzx r12d, byte ptr [rbp+88H] - cmp rsi, 4 - jc final3blocks -outerloop4: - movdqu xmm3, xmmword ptr [rcx] - pshufd xmm0, xmm3, 00H - pshufd xmm1, xmm3, 55H - pshufd xmm2, xmm3, 0AAH - pshufd xmm3, xmm3, 0FFH - movdqu xmm7, xmmword ptr [rcx+10H] - pshufd xmm4, xmm7, 00H - pshufd xmm5, xmm7, 55H - pshufd xmm6, xmm7, 0AAH - pshufd xmm7, xmm7, 0FFH - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - mov r10, qword ptr [rdi+10H] - mov r11, qword ptr [rdi+18H] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -innerloop4: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movdqu xmm8, xmmword ptr [r8+rdx-40H] - movdqu xmm9, xmmword ptr [r9+rdx-40H] - movdqu xmm10, xmmword ptr [r10+rdx-40H] - movdqu xmm11, xmmword ptr [r11+rdx-40H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp], xmm8 - movdqa xmmword ptr [rsp+10H], xmm9 - movdqa xmmword ptr [rsp+20H], xmm12 - movdqa xmmword ptr [rsp+30H], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-30H] - movdqu xmm9, xmmword ptr [r9+rdx-30H] - movdqu xmm10, xmmword ptr [r10+rdx-30H] - movdqu xmm11, xmmword ptr [r11+rdx-30H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+40H], xmm8 - movdqa xmmword ptr [rsp+50H], xmm9 - movdqa 
xmmword ptr [rsp+60H], xmm12 - movdqa xmmword ptr [rsp+70H], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-20H] - movdqu xmm9, xmmword ptr [r9+rdx-20H] - movdqu xmm10, xmmword ptr [r10+rdx-20H] - movdqu xmm11, xmmword ptr [r11+rdx-20H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+80H], xmm8 - movdqa xmmword ptr [rsp+90H], xmm9 - movdqa xmmword ptr [rsp+0A0H], xmm12 - movdqa xmmword ptr [rsp+0B0H], xmm13 - movdqu xmm8, xmmword ptr [r8+rdx-10H] - movdqu xmm9, xmmword ptr [r9+rdx-10H] - movdqu xmm10, xmmword ptr [r10+rdx-10H] - movdqu xmm11, xmmword ptr [r11+rdx-10H] - movdqa xmm12, xmm8 - punpckldq xmm8, xmm9 - punpckhdq xmm12, xmm9 - movdqa xmm14, xmm10 - punpckldq xmm10, xmm11 - punpckhdq xmm14, xmm11 - movdqa xmm9, xmm8 - punpcklqdq xmm8, xmm10 - punpckhqdq xmm9, xmm10 - movdqa xmm13, xmm12 - punpcklqdq xmm12, xmm14 - punpckhqdq xmm13, xmm14 - movdqa xmmword ptr [rsp+0C0H], xmm8 - movdqa xmmword ptr [rsp+0D0H], xmm9 - movdqa xmmword ptr [rsp+0E0H], xmm12 - movdqa xmmword ptr [rsp+0F0H], xmm13 - movdqa xmm9, xmmword ptr [BLAKE3_IV_1] - movdqa xmm10, xmmword ptr [BLAKE3_IV_2] - movdqa xmm11, xmmword ptr [BLAKE3_IV_3] - movdqa xmm12, xmmword ptr [rsp+110H] - movdqa xmm13, xmmword ptr [rsp+120H] - movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] - movd xmm15, eax - pshufd xmm15, xmm15, 00H - prefetcht0 byte ptr [r8+rdx+80H] - prefetcht0 byte ptr [r9+rdx+80H] - prefetcht0 byte ptr [r10+rdx+80H] - prefetcht0 byte ptr [r11+rdx+80H] - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+20H] - paddd xmm2, xmmword ptr [rsp+40H] - paddd xmm3, xmmword ptr [rsp+60H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - 
movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [BLAKE3_IV_0] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+10H] - paddd xmm1, xmmword ptr [rsp+30H] - paddd xmm2, xmmword ptr [rsp+50H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+80H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp+0C0H] - paddd xmm3, xmmword ptr [rsp+0E0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, 
xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+90H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+0D0H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+20H] - paddd xmm1, xmmword ptr [rsp+30H] - paddd xmm2, xmmword ptr [rsp+70H] - paddd xmm3, xmmword ptr [rsp+40H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, 
xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+60H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+0D0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+10H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, xmmword ptr [rsp+90H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por 
xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0B0H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp+0E0H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+30H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp+0D0H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - 
por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+40H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, xmmword ptr [rsp+20H] - paddd xmm3, xmmword ptr [rsp+0E0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+60H] - paddd xmm1, xmmword ptr [rsp+90H] - paddd xmm2, xmmword ptr [rsp+0B0H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+50H] - paddd xmm1, 
xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+0F0H] - paddd xmm3, xmmword ptr [rsp+10H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0A0H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, xmmword ptr [rsp+0E0H] - paddd xmm3, xmmword ptr [rsp+0D0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+70H] - paddd xmm1, xmmword ptr [rsp+90H] - paddd xmm2, xmmword ptr [rsp+30H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd 
xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+40H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+50H] - paddd xmm3, xmmword ptr [rsp+10H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp] - paddd xmm1, xmmword ptr [rsp+20H] - paddd xmm2, xmmword ptr [rsp+80H] - paddd xmm3, xmmword ptr [rsp+60H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - 
pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0C0H] - paddd xmm1, xmmword ptr [rsp+90H] - paddd xmm2, xmmword ptr [rsp+0F0H] - paddd xmm3, xmmword ptr [rsp+0E0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0D0H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+0A0H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, 
xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+70H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp] - paddd xmm3, xmmword ptr [rsp+60H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+20H] - paddd xmm1, xmmword ptr [rsp+30H] - paddd xmm2, xmmword ptr [rsp+10H] - paddd xmm3, xmmword ptr [rsp+40H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - 
movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+90H] - paddd xmm1, xmmword ptr [rsp+0B0H] - paddd xmm2, xmmword ptr [rsp+80H] - paddd xmm3, xmmword ptr [rsp+0F0H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0E0H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp+0C0H] - paddd xmm3, xmmword ptr [rsp+10H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld 
xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0D0H] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+20H] - paddd xmm3, xmmword ptr [rsp+40H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+30H] - paddd xmm1, xmmword ptr [rsp+0A0H] - paddd xmm2, xmmword ptr [rsp+60H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld 
xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0B0H] - paddd xmm1, xmmword ptr [rsp+50H] - paddd xmm2, xmmword ptr [rsp+10H] - paddd xmm3, xmmword ptr [rsp+80H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0F0H] - paddd xmm1, xmmword ptr [rsp] - paddd xmm2, xmmword ptr [rsp+90H] - paddd xmm3, xmmword ptr [rsp+60H] - paddd xmm0, xmm4 - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 - pxor xmm12, xmm0 - pxor xmm13, xmm1 - pxor xmm14, xmm2 - pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm12 - paddd xmm9, xmm13 - paddd xmm10, xmm14 - paddd xmm11, xmm15 - pxor xmm4, xmm8 - pxor xmm5, xmm9 - pxor xmm6, xmm10 - pxor xmm7, xmm11 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - paddd xmm0, xmmword ptr [rsp+0E0H] - paddd xmm1, xmmword ptr [rsp+20H] - paddd xmm2, xmmword ptr 
[rsp+30H] - paddd xmm3, xmmword ptr [rsp+70H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - movdqa xmmword ptr [rsp+100H], xmm8 - movdqa xmm8, xmm5 - psrld xmm8, 12 - pslld xmm5, 20 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 12 - pslld xmm6, 20 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 12 - pslld xmm7, 20 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 12 - pslld xmm4, 20 - por xmm4, xmm8 - paddd xmm0, xmmword ptr [rsp+0A0H] - paddd xmm1, xmmword ptr [rsp+0C0H] - paddd xmm2, xmmword ptr [rsp+40H] - paddd xmm3, xmmword ptr [rsp+0D0H] - paddd xmm0, xmm5 - paddd xmm1, xmm6 - paddd xmm2, xmm7 - paddd xmm3, xmm4 - pxor xmm15, xmm0 - pxor xmm12, xmm1 - pxor xmm13, xmm2 - pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - paddd xmm10, xmm15 - paddd xmm11, xmm12 - movdqa xmm8, xmmword ptr [rsp+100H] - paddd xmm8, xmm13 - paddd xmm9, xmm14 - pxor xmm5, xmm10 - pxor xmm6, xmm11 - pxor xmm7, xmm8 - pxor xmm4, xmm9 - pxor xmm0, xmm8 - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - movdqa xmm8, xmm5 - psrld xmm8, 7 - pslld xmm5, 25 - por xmm5, xmm8 - movdqa xmm8, xmm6 - psrld xmm8, 7 - pslld xmm6, 25 - por xmm6, xmm8 - movdqa xmm8, xmm7 - psrld xmm8, 7 - pslld xmm7, 25 - por xmm7, xmm8 - movdqa xmm8, xmm4 - psrld xmm8, 7 - pslld xmm4, 25 - por xmm4, xmm8 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - pxor xmm6, xmm14 - pxor xmm7, xmm15 - mov eax, r13d - jne innerloop4 - movdqa xmm9, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm9, xmm1 - movdqa xmm11, xmm2 - 
punpckldq xmm2, xmm3 - punpckhdq xmm11, xmm3 - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 - punpckhqdq xmm1, xmm2 - movdqa xmm3, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm3, xmm11 - movdqu xmmword ptr [rbx], xmm0 - movdqu xmmword ptr [rbx+20H], xmm1 - movdqu xmmword ptr [rbx+40H], xmm9 - movdqu xmmword ptr [rbx+60H], xmm3 - movdqa xmm9, xmm4 - punpckldq xmm4, xmm5 - punpckhdq xmm9, xmm5 - movdqa xmm11, xmm6 - punpckldq xmm6, xmm7 - punpckhdq xmm11, xmm7 - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm6 - punpckhqdq xmm5, xmm6 - movdqa xmm7, xmm9 - punpcklqdq xmm9, xmm11 - punpckhqdq xmm7, xmm11 - movdqu xmmword ptr [rbx+10H], xmm4 - movdqu xmmword ptr [rbx+30H], xmm5 - movdqu xmmword ptr [rbx+50H], xmm9 - movdqu xmmword ptr [rbx+70H], xmm7 - movdqa xmm1, xmmword ptr [rsp+110H] - movdqa xmm0, xmm1 - paddd xmm1, xmmword ptr [rsp+150H] - movdqa xmmword ptr [rsp+110H], xmm1 - pxor xmm0, xmmword ptr [CMP_MSB_MASK] - pxor xmm1, xmmword ptr [CMP_MSB_MASK] - pcmpgtd xmm0, xmm1 - movdqa xmm1, xmmword ptr [rsp+120H] - psubd xmm1, xmm0 - movdqa xmmword ptr [rsp+120H], xmm1 - add rbx, 128 - add rdi, 32 - sub rsi, 4 - cmp rsi, 4 - jnc outerloop4 - test rsi, rsi - jne final3blocks -unwind: - movdqa xmm6, xmmword ptr [rsp+170H] - movdqa xmm7, xmmword ptr [rsp+180H] - movdqa xmm8, xmmword ptr [rsp+190H] - movdqa xmm9, xmmword ptr [rsp+1A0H] - movdqa xmm10, xmmword ptr [rsp+1B0H] - movdqa xmm11, xmmword ptr [rsp+1C0H] - movdqa xmm12, xmmword ptr [rsp+1D0H] - movdqa xmm13, xmmword ptr [rsp+1E0H] - movdqa xmm14, xmmword ptr [rsp+1F0H] - movdqa xmm15, xmmword ptr [rsp+200H] - mov rsp, rbp - pop rbp - pop rbx - pop rdi - pop rsi - pop r12 - pop r13 - pop r14 - pop r15 - ret -ALIGN 16 -final3blocks: - test esi, 2H - je final1block - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+10H] - movaps xmm8, xmm0 - movaps xmm9, xmm1 - movd xmm13, dword ptr [rsp+110H] - pinsrd xmm13, dword ptr [rsp+120H], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 - movaps xmmword ptr [rsp], 
xmm13 - movd xmm14, dword ptr [rsp+114H] - pinsrd xmm14, dword ptr [rsp+124H], 1 - pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 - movaps xmmword ptr [rsp+10H], xmm14 - mov r8, qword ptr [rdi] - mov r9, qword ptr [rdi+8H] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -innerloop2: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV] - movaps xmm10, xmm2 - movups xmm4, xmmword ptr [r8+rdx-40H] - movups xmm5, xmmword ptr [r8+rdx-30H] - movaps xmm3, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm3, xmm5, 221 - movaps xmm5, xmm3 - movups xmm6, xmmword ptr [r8+rdx-20H] - movups xmm7, xmmword ptr [r8+rdx-10H] - movaps xmm3, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm3, xmm7, 221 - pshufd xmm7, xmm3, 93H - movups xmm12, xmmword ptr [r9+rdx-40H] - movups xmm13, xmmword ptr [r9+rdx-30H] - movaps xmm11, xmm12 - shufps xmm12, xmm13, 136 - shufps xmm11, xmm13, 221 - movaps xmm13, xmm11 - movups xmm14, xmmword ptr [r9+rdx-20H] - movups xmm15, xmmword ptr [r9+rdx-10H] - movaps xmm11, xmm14 - shufps xmm14, xmm15, 136 - pshufd xmm14, xmm14, 93H - shufps xmm11, xmm15, 221 - pshufd xmm15, xmm11, 93H - movaps xmm3, xmmword ptr [rsp] - movaps xmm11, xmmword ptr [rsp+10H] - pinsrd xmm3, eax, 3 - pinsrd xmm11, eax, 3 - mov al, 7 -roundloop2: - paddd xmm0, xmm4 - paddd xmm8, xmm12 - movaps xmmword ptr [rsp+20H], xmm4 - movaps xmmword ptr [rsp+30H], xmm12 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm5 - paddd xmm8, xmm13 - movaps xmmword ptr [rsp+40H], xmm5 - movaps xmmword ptr [rsp+50H], xmm13 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - 
pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 93H - pshufd xmm8, xmm8, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm11, xmm11, 4EH - pshufd xmm2, xmm2, 39H - pshufd xmm10, xmm10, 39H - paddd xmm0, xmm6 - paddd xmm8, xmm14 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 20 - psrld xmm4, 12 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 20 - psrld xmm4, 12 - por xmm9, xmm4 - paddd xmm0, xmm7 - paddd xmm8, xmm15 - paddd xmm0, xmm1 - paddd xmm8, xmm9 - pxor xmm3, xmm0 - pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 - paddd xmm2, xmm3 - paddd xmm10, xmm11 - pxor xmm1, xmm2 - pxor xmm9, xmm10 - movdqa xmm4, xmm1 - pslld xmm1, 25 - psrld xmm4, 7 - por xmm1, xmm4 - movdqa xmm4, xmm9 - pslld xmm9, 25 - psrld xmm4, 7 - por xmm9, xmm4 - pshufd xmm0, xmm0, 39H - pshufd xmm8, xmm8, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm11, xmm11, 4EH - pshufd xmm2, xmm2, 93H - pshufd xmm10, xmm10, 93H - dec al - je endroundloop2 - movdqa xmm12, xmmword ptr [rsp+20H] - movdqa xmm5, xmmword ptr [rsp+40H] - pshufd xmm13, xmm12, 0FH - shufps xmm12, xmm5, 214 - pshufd xmm4, xmm12, 39H - movdqa xmm12, xmm6 - shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0CCH - movdqa xmm12, xmm7 - punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0C0H - pshufd xmm12, xmm12, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmmword ptr [rsp+20H], xmm13 - movdqa xmmword ptr [rsp+40H], xmm12 - movdqa xmm5, xmmword ptr [rsp+30H] - movdqa xmm13, xmmword ptr [rsp+50H] - pshufd xmm6, xmm5, 0FH - shufps xmm5, xmm13, 
214 - pshufd xmm12, xmm5, 39H - movdqa xmm5, xmm14 - shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0CCH - movdqa xmm5, xmm15 - punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0C0H - pshufd xmm5, xmm5, 78H - punpckhdq xmm13, xmm15 - punpckldq xmm14, xmm13 - pshufd xmm15, xmm14, 1EH - movdqa xmm13, xmm6 - movdqa xmm14, xmm5 - movdqa xmm5, xmmword ptr [rsp+20H] - movdqa xmm6, xmmword ptr [rsp+40H] - jmp roundloop2 -endroundloop2: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm8, xmm10 - pxor xmm9, xmm11 - mov eax, r13d - cmp rdx, r15 - jne innerloop2 - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+10H], xmm1 - movups xmmword ptr [rbx+20H], xmm8 - movups xmmword ptr [rbx+30H], xmm9 - movdqa xmm0, xmmword ptr [rsp+130H] - movdqa xmm1, xmmword ptr [rsp+110H] - movdqa xmm2, xmmword ptr [rsp+120H] - movdqu xmm3, xmmword ptr [rsp+118H] - movdqu xmm4, xmmword ptr [rsp+128H] - blendvps xmm1, xmm3, xmm0 - blendvps xmm2, xmm4, xmm0 - movdqa xmmword ptr [rsp+110H], xmm1 - movdqa xmmword ptr [rsp+120H], xmm2 - add rdi, 16 - add rbx, 64 - sub rsi, 2 -final1block: - test esi, 1H - je unwind - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+10H] - movd xmm13, dword ptr [rsp+110H] - pinsrd xmm13, dword ptr [rsp+120H], 1 - pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 - movaps xmm14, xmmword ptr [ROT8] - movaps xmm15, xmmword ptr [ROT16] - mov r8, qword ptr [rdi] - movzx eax, byte ptr [rbp+80H] - or eax, r13d - xor edx, edx -innerloop1: - mov r14d, eax - or eax, r12d - add rdx, 64 - cmp rdx, r15 - cmovne eax, r14d - movaps xmm2, xmmword ptr [BLAKE3_IV] - movaps xmm3, xmm13 - pinsrd xmm3, eax, 3 - movups xmm4, xmmword ptr [r8+rdx-40H] - movups xmm5, xmmword ptr [r8+rdx-30H] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [r8+rdx-20H] - movups xmm7, xmmword ptr [r8+rdx-10H] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 93H - 
mov al, 7 -roundloop1: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 39H - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 93H - dec al - jz endroundloop1 - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0FH - pshufd xmm4, xmm8, 39H - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0CCH - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0C0H - pshufd xmm8, xmm8, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp roundloop1 -endroundloop1: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - mov eax, r13d - cmp rdx, r15 - jne innerloop1 - movups xmmword ptr [rbx], xmm0 - movups xmmword ptr [rbx+10H], xmm1 - jmp unwind -_blake3_hash_many_sse41 ENDP -blake3_hash_many_sse41 ENDP - -blake3_compress_in_place_sse41 PROC -_blake3_compress_in_place_sse41 PROC - sub rsp, 120 - movdqa xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+10H], xmm7 - movdqa xmmword ptr [rsp+20H], xmm8 - movdqa xmmword ptr [rsp+30H], xmm9 - movdqa xmmword ptr [rsp+40H], xmm11 - movdqa xmmword ptr [rsp+50H], xmm14 - movdqa xmmword ptr [rsp+60H], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr 
[rcx+10H] - movaps xmm2, xmmword ptr [BLAKE3_IV] - movzx eax, byte ptr [rsp+0A0H] - movzx r8d, r8b - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+10H] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+20H] - movups xmm7, xmmword ptr [rdx+30H] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 93H - movaps xmm14, xmmword ptr [ROT8] - movaps xmm15, xmmword ptr [ROT16] - mov al, 7 -@@: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 39H - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 93H - dec al - jz @F - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0FH - pshufd xmm4, xmm8, 39H - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0CCH - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0C0H - pshufd xmm8, xmm8, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp @B -@@: - pxor xmm0, xmm2 - pxor xmm1, xmm3 - movups xmmword ptr [rcx], xmm0 
- movups xmmword ptr [rcx+10H], xmm1 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+10H] - movdqa xmm8, xmmword ptr [rsp+20H] - movdqa xmm9, xmmword ptr [rsp+30H] - movdqa xmm11, xmmword ptr [rsp+40H] - movdqa xmm14, xmmword ptr [rsp+50H] - movdqa xmm15, xmmword ptr [rsp+60H] - add rsp, 120 - ret -_blake3_compress_in_place_sse41 ENDP -blake3_compress_in_place_sse41 ENDP - -ALIGN 16 -blake3_compress_xof_sse41 PROC -_blake3_compress_xof_sse41 PROC - sub rsp, 120 - movdqa xmmword ptr [rsp], xmm6 - movdqa xmmword ptr [rsp+10H], xmm7 - movdqa xmmword ptr [rsp+20H], xmm8 - movdqa xmmword ptr [rsp+30H], xmm9 - movdqa xmmword ptr [rsp+40H], xmm11 - movdqa xmmword ptr [rsp+50H], xmm14 - movdqa xmmword ptr [rsp+60H], xmm15 - movups xmm0, xmmword ptr [rcx] - movups xmm1, xmmword ptr [rcx+10H] - movaps xmm2, xmmword ptr [BLAKE3_IV] - movzx eax, byte ptr [rsp+0A0H] - movzx r8d, r8b - mov r10, qword ptr [rsp+0A8H] - shl rax, 32 - add r8, rax - movq xmm3, r9 - movq xmm4, r8 - punpcklqdq xmm3, xmm4 - movups xmm4, xmmword ptr [rdx] - movups xmm5, xmmword ptr [rdx+10H] - movaps xmm8, xmm4 - shufps xmm4, xmm5, 136 - shufps xmm8, xmm5, 221 - movaps xmm5, xmm8 - movups xmm6, xmmword ptr [rdx+20H] - movups xmm7, xmmword ptr [rdx+30H] - movaps xmm8, xmm6 - shufps xmm6, xmm7, 136 - pshufd xmm6, xmm6, 93H - shufps xmm8, xmm7, 221 - pshufd xmm7, xmm8, 93H - movaps xmm14, xmmword ptr [ROT8] - movaps xmm15, xmmword ptr [ROT16] - mov al, 7 -@@: - paddd xmm0, xmm4 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm5 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 93H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 39H - paddd xmm0, xmm6 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm15 - paddd xmm2, 
xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 20 - psrld xmm11, 12 - por xmm1, xmm11 - paddd xmm0, xmm7 - paddd xmm0, xmm1 - pxor xmm3, xmm0 - pshufb xmm3, xmm14 - paddd xmm2, xmm3 - pxor xmm1, xmm2 - movdqa xmm11, xmm1 - pslld xmm1, 25 - psrld xmm11, 7 - por xmm1, xmm11 - pshufd xmm0, xmm0, 39H - pshufd xmm3, xmm3, 4EH - pshufd xmm2, xmm2, 93H - dec al - jz @F - movdqa xmm8, xmm4 - shufps xmm8, xmm5, 214 - pshufd xmm9, xmm4, 0FH - pshufd xmm4, xmm8, 39H - movdqa xmm8, xmm6 - shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0CCH - movdqa xmm8, xmm7 - punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0C0H - pshufd xmm8, xmm8, 78H - punpckhdq xmm5, xmm7 - punpckldq xmm6, xmm5 - pshufd xmm7, xmm6, 1EH - movdqa xmm5, xmm9 - movdqa xmm6, xmm8 - jmp @B -@@: - movdqu xmm4, xmmword ptr [rcx] - movdqu xmm5, xmmword ptr [rcx+10H] - pxor xmm0, xmm2 - pxor xmm1, xmm3 - pxor xmm2, xmm4 - pxor xmm3, xmm5 - movups xmmword ptr [r10], xmm0 - movups xmmword ptr [r10+10H], xmm1 - movups xmmword ptr [r10+20H], xmm2 - movups xmmword ptr [r10+30H], xmm3 - movdqa xmm6, xmmword ptr [rsp] - movdqa xmm7, xmmword ptr [rsp+10H] - movdqa xmm8, xmmword ptr [rsp+20H] - movdqa xmm9, xmmword ptr [rsp+30H] - movdqa xmm11, xmmword ptr [rsp+40H] - movdqa xmm14, xmmword ptr [rsp+50H] - movdqa xmm15, xmmword ptr [rsp+60H] - add rsp, 120 - ret -_blake3_compress_xof_sse41 ENDP -blake3_compress_xof_sse41 ENDP - -_TEXT ENDS - - -_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' -ALIGN 64 -BLAKE3_IV: - dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH - -ADD0: - dd 0, 1, 2, 3 - -ADD1: - dd 4 dup (4) - -BLAKE3_IV_0: - dd 4 dup (6A09E667H) - -BLAKE3_IV_1: - dd 4 dup (0BB67AE85H) - -BLAKE3_IV_2: - dd 4 dup (3C6EF372H) - -BLAKE3_IV_3: - dd 4 dup (0A54FF53AH) - -BLAKE3_BLOCK_LEN: - dd 4 dup (64) - -ROT16: - db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 - -ROT8: - db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 - -CMP_MSB_MASK: - dd 8 dup(80000000H) - -_RDATA ENDS -END - diff --git 
a/thirdparty/BLAKE3/c/example.c b/thirdparty/BLAKE3/c/example.c deleted file mode 100644 index 02fe3c32b..000000000 --- a/thirdparty/BLAKE3/c/example.c +++ /dev/null @@ -1,27 +0,0 @@ -#include "blake3.h" -#include <stdio.h> -#include <unistd.h> - -int main() { - // Initialize the hasher. - blake3_hasher hasher; - blake3_hasher_init(&hasher); - - // Read input bytes from stdin. - unsigned char buf[65536]; - ssize_t n; - while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { - blake3_hasher_update(&hasher, buf, n); - } - - // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. - uint8_t output[BLAKE3_OUT_LEN]; - blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); - - // Print the hash as hexadecimal. - for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { - printf("%02x", output[i]); - } - printf("\n"); - return 0; -} diff --git a/thirdparty/BLAKE3/c/main.c b/thirdparty/BLAKE3/c/main.c deleted file mode 100644 index 9b8a436f3..000000000 --- a/thirdparty/BLAKE3/c/main.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * This main file is intended for testing via `make test`. It does not build in - * other settings. See README.md in this directory for examples of how to build - * C code. 
- */ - -#include <assert.h> -#include <errno.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> - -#include "blake3.h" -#include "blake3_impl.h" - -#define HASH_MODE 0 -#define KEYED_HASH_MODE 1 -#define DERIVE_KEY_MODE 2 - -static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) { - if ('0' <= c && c <= '9') { - *value = c - '0'; - *valid = true; - } else if ('a' <= c && c <= 'f') { - *value = 10 + c - 'a'; - *valid = true; - } else { - *valid = false; - } -} - -static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) { - size_t hex_len = strlen(hex_key); - if (hex_len != 64) { - fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n", - hex_len); - return 1; - } - for (size_t i = 0; i < 64; i++) { - uint8_t value; - bool valid; - hex_char_value(hex_key[i], &value, &valid); - if (!valid) { - fprintf(stderr, "Invalid hex char.\n"); - return 1; - } - if (i % 2 == 0) { - out[i / 2] = 0; - value <<= 4; - } - out[i / 2] += value; - } - return 0; -} - -/* A little repetition here */ -enum cpu_feature { - SSE2 = 1 << 0, - SSSE3 = 1 << 1, - SSE41 = 1 << 2, - AVX = 1 << 3, - AVX2 = 1 << 4, - AVX512F = 1 << 5, - AVX512VL = 1 << 6, - /* ... 
*/ - UNDEFINED = 1 << 30 -}; - -extern enum cpu_feature g_cpu_features; -enum cpu_feature get_cpu_features(); - -int main(int argc, char **argv) { - size_t out_len = BLAKE3_OUT_LEN; - uint8_t key[BLAKE3_KEY_LEN]; - char *context = ""; - uint8_t mode = HASH_MODE; - while (argc > 1) { - if (argc <= 2) { - fprintf(stderr, "Odd number of arguments.\n"); - return 1; - } - if (strcmp("--length", argv[1]) == 0) { - char *endptr = NULL; - errno = 0; - unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10); - if (errno != 0 || out_len > SIZE_MAX || endptr == argv[2] || - *endptr != 0) { - fprintf(stderr, "Bad length argument.\n"); - return 1; - } - out_len = (size_t)out_len_ll; - } else if (strcmp("--keyed", argv[1]) == 0) { - mode = KEYED_HASH_MODE; - int ret = parse_key(argv[2], key); - if (ret != 0) { - return ret; - } - } else if (strcmp("--derive-key", argv[1]) == 0) { - mode = DERIVE_KEY_MODE; - context = argv[2]; - } else { - fprintf(stderr, "Unknown flag.\n"); - return 1; - } - argc -= 2; - argv += 2; - } - - /* - * We're going to hash the input multiple times, so we need to buffer it all. - * This is just for test cases, so go ahead and assume that the input is less - * than 1 MiB. 
- */ - size_t buf_capacity = 1 << 20; - uint8_t *buf = malloc(buf_capacity); - assert(buf != NULL); - size_t buf_len = 0; - while (1) { - size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin); - if (n == 0) { - break; - } - buf_len += n; - assert(buf_len < buf_capacity); - } - - const int mask = get_cpu_features(); - int feature = 0; - do { - fprintf(stderr, "Testing 0x%08X\n", feature); - g_cpu_features = feature; - blake3_hasher hasher; - switch (mode) { - case HASH_MODE: - blake3_hasher_init(&hasher); - break; - case KEYED_HASH_MODE: - blake3_hasher_init_keyed(&hasher, key); - break; - case DERIVE_KEY_MODE: - blake3_hasher_init_derive_key(&hasher, context); - break; - default: - abort(); - } - - blake3_hasher_update(&hasher, buf, buf_len); - - /* TODO: An incremental output reader API to avoid this allocation. */ - uint8_t *out = malloc(out_len); - if (out_len > 0 && out == NULL) { - fprintf(stderr, "malloc() failed.\n"); - return 1; - } - blake3_hasher_finalize(&hasher, out, out_len); - for (size_t i = 0; i < out_len; i++) { - printf("%02x", out[i]); - } - printf("\n"); - free(out); - feature = (feature - mask) & mask; - } while (feature != 0); - free(buf); - return 0; -} diff --git a/thirdparty/BLAKE3/c/test.py b/thirdparty/BLAKE3/c/test.py deleted file mode 100644 index b0b192950..000000000 --- a/thirdparty/BLAKE3/c/test.py +++ /dev/null @@ -1,97 +0,0 @@ -#! /usr/bin/env python3 - -from binascii import hexlify -import json -from os import path -import subprocess - -HERE = path.dirname(__file__) -TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json") -TEST_VECTORS = json.load(open(TEST_VECTORS_PATH)) - - -def run_blake3(args, input): - output = subprocess.run([path.join(HERE, "blake3")] + args, - input=input, - stdout=subprocess.PIPE, - check=True) - return output.stdout.decode().strip() - - -# Fill the input with a repeating byte pattern. 
We use a cycle length of 251, -# because that's the largets prime number less than 256. This makes it unlikely -# to swapping any two adjacent input blocks or chunks will give the same -# answer. -def make_test_input(length): - i = 0 - buf = bytearray() - while len(buf) < length: - buf.append(i) - i = (i + 1) % 251 - return buf - - -def main(): - for case in TEST_VECTORS["cases"]: - input_len = case["input_len"] - input = make_test_input(input_len) - hex_key = hexlify(TEST_VECTORS["key"].encode()) - context_string = TEST_VECTORS["context_string"] - expected_hash_xof = case["hash"] - expected_hash = expected_hash_xof[:64] - expected_keyed_hash_xof = case["keyed_hash"] - expected_keyed_hash = expected_keyed_hash_xof[:64] - expected_derive_key_xof = case["derive_key"] - expected_derive_key = expected_derive_key_xof[:64] - - # Test the default hash. - test_hash = run_blake3([], input) - for line in test_hash.splitlines(): - assert expected_hash == line, \ - "hash({}): {} != {}".format(input_len, expected_hash, line) - - # Test the extended hash. - xof_len = len(expected_hash_xof) // 2 - test_hash_xof = run_blake3(["--length", str(xof_len)], input) - for line in test_hash_xof.splitlines(): - assert expected_hash_xof == line, \ - "hash_xof({}): {} != {}".format( - input_len, expected_hash_xof, line) - - # Test the default keyed hash. - test_keyed_hash = run_blake3(["--keyed", hex_key], input) - for line in test_keyed_hash.splitlines(): - assert expected_keyed_hash == line, \ - "keyed_hash({}): {} != {}".format( - input_len, expected_keyed_hash, line) - - # Test the extended keyed hash. - xof_len = len(expected_keyed_hash_xof) // 2 - test_keyed_hash_xof = run_blake3( - ["--keyed", hex_key, "--length", - str(xof_len)], input) - for line in test_keyed_hash_xof.splitlines(): - assert expected_keyed_hash_xof == line, \ - "keyed_hash_xof({}): {} != {}".format( - input_len, expected_keyed_hash_xof, line) - - # Test the default derive key. 
- test_derive_key = run_blake3(["--derive-key", context_string], input) - for line in test_derive_key.splitlines(): - assert expected_derive_key == line, \ - "derive_key({}): {} != {}".format( - input_len, expected_derive_key, line) - - # Test the extended derive key. - xof_len = len(expected_derive_key_xof) // 2 - test_derive_key_xof = run_blake3( - ["--derive-key", context_string, "--length", - str(xof_len)], input) - for line in test_derive_key_xof.splitlines(): - assert expected_derive_key_xof == line, \ - "derive_key_xof({}): {} != {}".format( - input_len, expected_derive_key_xof, line) - - -if __name__ == "__main__": - main() diff --git a/thirdparty/BLAKE3/lib/Linux_x64/libblake3.a b/thirdparty/BLAKE3/lib/Linux_x64/libblake3.a Binary files differdeleted file mode 100644 index b956e22cb..000000000 --- a/thirdparty/BLAKE3/lib/Linux_x64/libblake3.a +++ /dev/null diff --git a/thirdparty/BLAKE3/lib/Mac_arm64/libblake3.a b/thirdparty/BLAKE3/lib/Mac_arm64/libblake3.a Binary files differdeleted file mode 100644 index a86e4001e..000000000 --- a/thirdparty/BLAKE3/lib/Mac_arm64/libblake3.a +++ /dev/null diff --git a/thirdparty/BLAKE3/lib/Mac_x64/libblake3.a b/thirdparty/BLAKE3/lib/Mac_x64/libblake3.a Binary files differdeleted file mode 100644 index c2ed0276a..000000000 --- a/thirdparty/BLAKE3/lib/Mac_x64/libblake3.a +++ /dev/null diff --git a/thirdparty/BLAKE3/lib/Win64/BLAKE3.lib b/thirdparty/BLAKE3/lib/Win64/BLAKE3.lib Binary files differdeleted file mode 100644 index 1308d9928..000000000 --- a/thirdparty/BLAKE3/lib/Win64/BLAKE3.lib +++ /dev/null diff --git a/thirdparty/BLAKE3/media/B3.svg b/thirdparty/BLAKE3/media/B3.svg deleted file mode 100644 index a50da0ce9..000000000 --- a/thirdparty/BLAKE3/media/B3.svg +++ /dev/null @@ -1,70 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Created with Inkscape (http://www.inkscape.org/) --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - 
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="13.356165mm" - height="7.1437497mm" - viewBox="0 0 13.356165 7.1437497" - version="1.1" - id="svg8" - sodipodi:docname="B3.svg" - inkscape:version="0.92.4 5da689c313, 2019-01-14"> - <defs - id="defs2" /> - <sodipodi:namedview - id="base" - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1.0" - inkscape:pageopacity="0.0" - inkscape:pageshadow="2" - inkscape:zoom="4" - inkscape:cx="72.73328" - inkscape:cy="-34.835127" - inkscape:document-units="mm" - inkscape:current-layer="layer1" - showgrid="false" - inkscape:window-width="1920" - inkscape:window-height="1016" - inkscape:window-x="0" - inkscape:window-y="27" - inkscape:window-maximized="1" /> - <metadata - id="metadata5"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title /> - </cc:Work> - </rdf:RDF> - </metadata> - <g - inkscape:label="Layer 1" - inkscape:groupmode="layer" - id="layer1" - transform="translate(-24.441005,-113.52518)"> - <g - aria-label="B3" - style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" - id="text868"> - <path - d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 
v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path814" /> - <path - d="m 35.38417,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 -0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814917,-0.23284 0.264583,-0.24341 0.264583,-0.67733 0,-0.85725 -1.090083,-0.85725 h -2.201334 v -1.13242 h 2.169584 q 0.550333,0 0.814916,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 -0.8255,-0.24342 h -3.566583 v -1.24883 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path816" /> - </g> - </g> -</svg> diff --git a/thirdparty/BLAKE3/media/BLAKE3.svg b/thirdparty/BLAKE3/media/BLAKE3.svg deleted file mode 100644 index 2d50c2d3b..000000000 --- a/thirdparty/BLAKE3/media/BLAKE3.svg +++ /dev/null @@ -1,85 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Created with Inkscape (http://www.inkscape.org/) --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - 
xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="43.063534mm" - height="7.2707496mm" - viewBox="0 0 43.063534 7.2707496" - version="1.1" - id="svg8" - sodipodi:docname="BLAKE3.svg" - inkscape:version="0.92.4 5da689c313, 2019-01-14"> - <defs - id="defs2" /> - <sodipodi:namedview - id="base" - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1.0" - inkscape:pageopacity="0.0" - inkscape:pageshadow="2" - inkscape:zoom="4" - inkscape:cx="72.73328" - inkscape:cy="-34.835127" - inkscape:document-units="mm" - inkscape:current-layer="layer1" - showgrid="false" - inkscape:window-width="1920" - inkscape:window-height="1016" - inkscape:window-x="0" - inkscape:window-y="27" - inkscape:window-maximized="1" /> - <metadata - id="metadata5"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - </cc:Work> - </rdf:RDF> - </metadata> - <g - inkscape:label="Layer 1" - inkscape:groupmode="layer" - id="layer1" - transform="translate(-24.441005,-113.39818)"> - <g - aria-label="BLAKE3" - style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" - id="text868"> - <path - d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 
0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path814" /> - <path - d="m 33.22517,113.52518 v 4.66725 q 0,0.254 0.0635,0.48683 0.07408,0.22225 0.243417,0.39159 0.169333,0.15875 0.4445,0.254 0.28575,0.0953 0.709083,0.0953 h 2.772833 v 1.24883 h -2.846916 q -0.709084,0 -1.217084,-0.17992 -0.497416,-0.1905 -0.814916,-0.51858 -0.3175,-0.32808 -0.465667,-0.77258 -0.137583,-0.45509 -0.137583,-0.99484 v -4.67783 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path816" /> - <path - d="M 39.342334,120.66893 H 37.9665 l 2.50825,-6.35 q 0.201084,-0.508 0.560917,-0.70908 0.370417,-0.21167 0.941917,-0.21167 0.560916,0 0.92075,0.21167 0.370416,0.20108 0.560916,0.70908 l 2.413,6.35 h -1.386416 l -2.169584,-5.74675 q -0.09525,-0.24342 -0.34925,-0.24342 -0.254,0 -0.359833,0.24342 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path818" /> - <path - d="m 48.179401,113.52518 v 3.02683 h 0.687917 q 0.455083,0 0.740833,-0.0212 0.296333,-0.0318 0.486833,-0.127 0.1905,-0.0953 0.3175,-0.26459 0.137584,-0.17991 0.28575,-0.47625 l 1.090084,-2.13783 h 1.344083 l -1.121833,2.2225 q -0.243417,0.47625 -0.518584,0.79375 -0.275166,0.3175 -0.719666,0.508 0.254,0.0635 0.4445,0.17992 0.1905,0.10583 0.34925,0.27516 0.169333,0.15875 0.3175,0.39159 0.148166,0.22225 0.306916,0.52916 l 
1.153584,2.24367 h -1.397 l -1.090084,-2.11667 q -0.148166,-0.28575 -0.28575,-0.45508 -0.137583,-0.16933 -0.34925,-0.26458 -0.211666,-0.0952 -0.529166,-0.11642 -0.3175,-0.0317 -0.8255,-0.0317 h -0.687917 v 2.9845 h -1.248833 v -7.14375 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path820" /> - <path - d="m 60.127965,113.52518 v 1.24883 h -3.577166 q -0.5715,0 -0.8255,0.24342 -0.254,0.24342 -0.254,0.65617 0,0.84666 1.090083,0.84666 h 3.513667 v 1.13242 h -3.545417 q -1.090083,0 -1.090083,0.86783 0,0.42334 0.264583,0.66675 0.264583,0.23284 0.814917,0.23284 h 3.6195 v 1.24883 h -3.6195 q -0.635,0 -1.090083,-0.14817 -0.4445,-0.14816 -0.740834,-0.40216 -0.28575,-0.26459 -0.423333,-0.62442 -0.127,-0.35983 -0.127,-0.77258 0,-0.61384 0.264583,-1.016 0.264584,-0.41275 0.762,-0.62442 -1.005416,-0.41275 -1.005416,-1.60867 0,-0.42333 0.137583,-0.78316 0.137583,-0.35984 0.423333,-0.61384 0.296334,-0.26458 0.740834,-0.40216 0.455083,-0.14817 1.090083,-0.14817 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path822" /> - <path - d="m 65.091539,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 -0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814916,-0.23284 0.264584,-0.24341 0.264584,-0.67733 0,-0.85725 -1.090084,-0.85725 h -2.201333 v -1.13242 h 2.169583 q 0.550334,0 0.814917,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 
-0.8255,-0.24342 h -3.566583 v -1.24883 z" - style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" - id="path824" /> - </g> - </g> -</svg> diff --git a/thirdparty/BLAKE3/media/speed.svg b/thirdparty/BLAKE3/media/speed.svg deleted file mode 100644 index 7bd65ca3c..000000000 --- a/thirdparty/BLAKE3/media/speed.svg +++ /dev/null @@ -1,1474 +0,0 @@ -<?xml version="1.0" encoding="utf-8" standalone="no"?> -<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" - "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> -<!-- Created with matplotlib (https://matplotlib.org/) --> -<svg height="331.389812pt" version="1.1" viewBox="0 0 449.761406 331.389812" width="449.761406pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> - <defs> - <style type="text/css"> -*{stroke-linecap:butt;stroke-linejoin:round;} - </style> - </defs> - <g id="figure_1"> - <g id="patch_1"> - <path d="M 0 331.389812 -L 449.761406 331.389812 -L 449.761406 0 -L 0 0 -z -" style="fill:#ffffff;"/> - </g> - <g id="axes_1"> - <g id="patch_2"> - <path d="M 71.443906 288.430125 -L 428.563906 288.430125 -L 428.563906 22.318125 -L 71.443906 22.318125 -z -" style="fill:#ffffff;"/> - </g> - <g id="matplotlib.axis_1"> - <g id="xtick_1"> - <g id="line2d_1"> - <defs> - <path d="M 0 0 -L 0 6 -" id="me95d5351a6" style="stroke:#262626;stroke-width:1.25;"/> - </defs> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_1"> - <!-- 0 --> - <defs> - <path d="M 31.78125 66.40625 -Q 24.171875 66.40625 20.328125 58.90625 -Q 16.5 51.421875 16.5 36.375 -Q 16.5 21.390625 20.328125 13.890625 -Q 24.171875 6.390625 31.78125 6.390625 -Q 39.453125 6.390625 43.28125 13.890625 -Q 47.125 21.390625 47.125 36.375 -Q 47.125 51.421875 43.28125 58.90625 -Q 39.453125 66.40625 31.78125 66.40625 -z -M 31.78125 
74.21875 -Q 44.046875 74.21875 50.515625 64.515625 -Q 56.984375 54.828125 56.984375 36.375 -Q 56.984375 17.96875 50.515625 8.265625 -Q 44.046875 -1.421875 31.78125 -1.421875 -Q 19.53125 -1.421875 13.0625 8.265625 -Q 6.59375 17.96875 6.59375 36.375 -Q 6.59375 54.828125 13.0625 64.515625 -Q 19.53125 74.21875 31.78125 74.21875 -z -" id="DejaVuSans-48"/> - </defs> - <g style="fill:#262626;" transform="translate(67.944531 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_2"> - <g id="line2d_2"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="116.083906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_2"> - <!-- 1000 --> - <defs> - <path d="M 12.40625 8.296875 -L 28.515625 8.296875 -L 28.515625 63.921875 -L 10.984375 60.40625 -L 10.984375 69.390625 -L 28.421875 72.90625 -L 38.28125 72.90625 -L 38.28125 8.296875 -L 54.390625 8.296875 -L 54.390625 0 -L 12.40625 0 -z -" id="DejaVuSans-49"/> - </defs> - <g style="fill:#262626;" transform="translate(102.086406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-49"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_3"> - <g id="line2d_3"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="160.723906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_3"> - <!-- 2000 --> - <defs> - <path d="M 19.1875 8.296875 -L 53.609375 8.296875 -L 53.609375 0 -L 7.328125 0 -L 7.328125 8.296875 -Q 12.9375 14.109375 22.625 23.890625 -Q 32.328125 33.6875 34.8125 36.53125 -Q 39.546875 41.84375 41.421875 45.53125 -Q 43.3125 49.21875 43.3125 52.78125 -Q 43.3125 58.59375 39.234375 62.25 -Q 35.15625 65.921875 28.609375 65.921875 -Q 23.96875 65.921875 18.8125 64.3125 -Q 13.671875 62.703125 7.8125 59.421875 -L 7.8125 69.390625 -Q 13.765625 71.78125 18.9375 73 -Q 
24.125 74.21875 28.421875 74.21875 -Q 39.75 74.21875 46.484375 68.546875 -Q 53.21875 62.890625 53.21875 53.421875 -Q 53.21875 48.921875 51.53125 44.890625 -Q 49.859375 40.875 45.40625 35.40625 -Q 44.1875 33.984375 37.640625 27.21875 -Q 31.109375 20.453125 19.1875 8.296875 -z -" id="DejaVuSans-50"/> - </defs> - <g style="fill:#262626;" transform="translate(146.726406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-50"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_4"> - <g id="line2d_4"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="205.363906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_4"> - <!-- 3000 --> - <defs> - <path d="M 40.578125 39.3125 -Q 47.65625 37.796875 51.625 33 -Q 55.609375 28.21875 55.609375 21.1875 -Q 55.609375 10.40625 48.1875 4.484375 -Q 40.765625 -1.421875 27.09375 -1.421875 -Q 22.515625 -1.421875 17.65625 -0.515625 -Q 12.796875 0.390625 7.625 2.203125 -L 7.625 11.71875 -Q 11.71875 9.328125 16.59375 8.109375 -Q 21.484375 6.890625 26.8125 6.890625 -Q 36.078125 6.890625 40.9375 10.546875 -Q 45.796875 14.203125 45.796875 21.1875 -Q 45.796875 27.640625 41.28125 31.265625 -Q 36.765625 34.90625 28.71875 34.90625 -L 20.21875 34.90625 -L 20.21875 43.015625 -L 29.109375 43.015625 -Q 36.375 43.015625 40.234375 45.921875 -Q 44.09375 48.828125 44.09375 54.296875 -Q 44.09375 59.90625 40.109375 62.90625 -Q 36.140625 65.921875 28.71875 65.921875 -Q 24.65625 65.921875 20.015625 65.03125 -Q 15.375 64.15625 9.8125 62.3125 -L 9.8125 71.09375 -Q 15.4375 72.65625 20.34375 73.4375 -Q 25.25 74.21875 29.59375 74.21875 -Q 40.828125 74.21875 47.359375 69.109375 -Q 53.90625 64.015625 53.90625 55.328125 -Q 53.90625 49.265625 50.4375 45.09375 -Q 46.96875 40.921875 40.578125 39.3125 -z -" id="DejaVuSans-51"/> - </defs> - <g style="fill:#262626;" 
transform="translate(191.366406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-51"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_5"> - <g id="line2d_5"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="250.003906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_5"> - <!-- 4000 --> - <defs> - <path d="M 37.796875 64.3125 -L 12.890625 25.390625 -L 37.796875 25.390625 -z -M 35.203125 72.90625 -L 47.609375 72.90625 -L 47.609375 25.390625 -L 58.015625 25.390625 -L 58.015625 17.1875 -L 47.609375 17.1875 -L 47.609375 0 -L 37.796875 0 -L 37.796875 17.1875 -L 4.890625 17.1875 -L 4.890625 26.703125 -z -" id="DejaVuSans-52"/> - </defs> - <g style="fill:#262626;" transform="translate(236.006406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-52"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_6"> - <g id="line2d_6"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="294.643906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_6"> - <!-- 5000 --> - <defs> - <path d="M 10.796875 72.90625 -L 49.515625 72.90625 -L 49.515625 64.59375 -L 19.828125 64.59375 -L 19.828125 46.734375 -Q 21.96875 47.46875 24.109375 47.828125 -Q 26.265625 48.1875 28.421875 48.1875 -Q 40.625 48.1875 47.75 41.5 -Q 54.890625 34.8125 54.890625 23.390625 -Q 54.890625 11.625 47.5625 5.09375 -Q 40.234375 -1.421875 26.90625 -1.421875 -Q 22.3125 -1.421875 17.546875 -0.640625 -Q 12.796875 0.140625 7.71875 1.703125 -L 7.71875 11.625 -Q 12.109375 9.234375 16.796875 8.0625 -Q 21.484375 6.890625 26.703125 6.890625 -Q 35.15625 6.890625 40.078125 11.328125 -Q 45.015625 15.765625 45.015625 23.390625 -Q 45.015625 31 
40.078125 35.4375 -Q 35.15625 39.890625 26.703125 39.890625 -Q 22.75 39.890625 18.8125 39.015625 -Q 14.890625 38.140625 10.796875 36.28125 -z -" id="DejaVuSans-53"/> - </defs> - <g style="fill:#262626;" transform="translate(280.646406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-53"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_7"> - <g id="line2d_7"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="339.283906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_7"> - <!-- 6000 --> - <defs> - <path d="M 33.015625 40.375 -Q 26.375 40.375 22.484375 35.828125 -Q 18.609375 31.296875 18.609375 23.390625 -Q 18.609375 15.53125 22.484375 10.953125 -Q 26.375 6.390625 33.015625 6.390625 -Q 39.65625 6.390625 43.53125 10.953125 -Q 47.40625 15.53125 47.40625 23.390625 -Q 47.40625 31.296875 43.53125 35.828125 -Q 39.65625 40.375 33.015625 40.375 -z -M 52.59375 71.296875 -L 52.59375 62.3125 -Q 48.875 64.0625 45.09375 64.984375 -Q 41.3125 65.921875 37.59375 65.921875 -Q 27.828125 65.921875 22.671875 59.328125 -Q 17.53125 52.734375 16.796875 39.40625 -Q 19.671875 43.65625 24.015625 45.921875 -Q 28.375 48.1875 33.59375 48.1875 -Q 44.578125 48.1875 50.953125 41.515625 -Q 57.328125 34.859375 57.328125 23.390625 -Q 57.328125 12.15625 50.6875 5.359375 -Q 44.046875 -1.421875 33.015625 -1.421875 -Q 20.359375 -1.421875 13.671875 8.265625 -Q 6.984375 17.96875 6.984375 36.375 -Q 6.984375 53.65625 15.1875 63.9375 -Q 23.390625 74.21875 37.203125 74.21875 -Q 40.921875 74.21875 44.703125 73.484375 -Q 48.484375 72.75 52.59375 71.296875 -z -" id="DejaVuSans-54"/> - </defs> - <g style="fill:#262626;" transform="translate(325.286406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-54"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" 
xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_8"> - <g id="line2d_8"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="383.923906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_8"> - <!-- 7000 --> - <defs> - <path d="M 8.203125 72.90625 -L 55.078125 72.90625 -L 55.078125 68.703125 -L 28.609375 0 -L 18.3125 0 -L 43.21875 64.59375 -L 8.203125 64.59375 -z -" id="DejaVuSans-55"/> - </defs> - <g style="fill:#262626;" transform="translate(369.926406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-55"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="xtick_9"> - <g id="line2d_9"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="428.563906" xlink:href="#me95d5351a6" y="288.430125"/> - </g> - </g> - <g id="text_9"> - <!-- 8000 --> - <defs> - <path d="M 31.78125 34.625 -Q 24.75 34.625 20.71875 30.859375 -Q 16.703125 27.09375 16.703125 20.515625 -Q 16.703125 13.921875 20.71875 10.15625 -Q 24.75 6.390625 31.78125 6.390625 -Q 38.8125 6.390625 42.859375 10.171875 -Q 46.921875 13.96875 46.921875 20.515625 -Q 46.921875 27.09375 42.890625 30.859375 -Q 38.875 34.625 31.78125 34.625 -z -M 21.921875 38.8125 -Q 15.578125 40.375 12.03125 44.71875 -Q 8.5 49.078125 8.5 55.328125 -Q 8.5 64.0625 14.71875 69.140625 -Q 20.953125 74.21875 31.78125 74.21875 -Q 42.671875 74.21875 48.875 69.140625 -Q 55.078125 64.0625 55.078125 55.328125 -Q 55.078125 49.078125 51.53125 44.71875 -Q 48 40.375 41.703125 38.8125 -Q 48.828125 37.15625 52.796875 32.3125 -Q 56.78125 27.484375 56.78125 20.515625 -Q 56.78125 9.90625 50.3125 4.234375 -Q 43.84375 -1.421875 31.78125 -1.421875 -Q 19.734375 -1.421875 13.25 4.234375 -Q 6.78125 9.90625 6.78125 20.515625 -Q 6.78125 27.484375 10.78125 32.3125 -Q 14.796875 37.15625 21.921875 
38.8125 -z -M 18.3125 54.390625 -Q 18.3125 48.734375 21.84375 45.5625 -Q 25.390625 42.390625 31.78125 42.390625 -Q 38.140625 42.390625 41.71875 45.5625 -Q 45.3125 48.734375 45.3125 54.390625 -Q 45.3125 60.0625 41.71875 63.234375 -Q 38.140625 66.40625 31.78125 66.40625 -Q 25.390625 66.40625 21.84375 63.234375 -Q 18.3125 60.0625 18.3125 54.390625 -z -" id="DejaVuSans-56"/> - </defs> - <g style="fill:#262626;" transform="translate(414.566406 306.288406)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-56"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - <use x="190.869141" xlink:href="#DejaVuSans-48"/> - </g> - </g> - </g> - <g id="text_10"> - <!-- Speed (MiB/s) --> - <defs> - <path d="M 53.515625 70.515625 -L 53.515625 60.890625 -Q 47.90625 63.578125 42.921875 64.890625 -Q 37.9375 66.21875 33.296875 66.21875 -Q 25.25 66.21875 20.875 63.09375 -Q 16.5 59.96875 16.5 54.203125 -Q 16.5 49.359375 19.40625 46.890625 -Q 22.3125 44.4375 30.421875 42.921875 -L 36.375 41.703125 -Q 47.40625 39.59375 52.65625 34.296875 -Q 57.90625 29 57.90625 20.125 -Q 57.90625 9.515625 50.796875 4.046875 -Q 43.703125 -1.421875 29.984375 -1.421875 -Q 24.8125 -1.421875 18.96875 -0.25 -Q 13.140625 0.921875 6.890625 3.21875 -L 6.890625 13.375 -Q 12.890625 10.015625 18.65625 8.296875 -Q 24.421875 6.59375 29.984375 6.59375 -Q 38.421875 6.59375 43.015625 9.90625 -Q 47.609375 13.234375 47.609375 19.390625 -Q 47.609375 24.75 44.3125 27.78125 -Q 41.015625 30.8125 33.5 32.328125 -L 27.484375 33.5 -Q 16.453125 35.6875 11.515625 40.375 -Q 6.59375 45.0625 6.59375 53.421875 -Q 6.59375 63.09375 13.40625 68.65625 -Q 20.21875 74.21875 32.171875 74.21875 -Q 37.3125 74.21875 42.625 73.28125 -Q 47.953125 72.359375 53.515625 70.515625 -z -" id="DejaVuSans-83"/> - <path d="M 18.109375 8.203125 -L 18.109375 -20.796875 -L 9.078125 -20.796875 -L 9.078125 54.6875 -L 18.109375 54.6875 -L 18.109375 46.390625 -Q 20.953125 51.265625 25.265625 53.625 -Q 29.59375 56 
35.59375 56 -Q 45.5625 56 51.78125 48.09375 -Q 58.015625 40.1875 58.015625 27.296875 -Q 58.015625 14.40625 51.78125 6.484375 -Q 45.5625 -1.421875 35.59375 -1.421875 -Q 29.59375 -1.421875 25.265625 0.953125 -Q 20.953125 3.328125 18.109375 8.203125 -z -M 48.6875 27.296875 -Q 48.6875 37.203125 44.609375 42.84375 -Q 40.53125 48.484375 33.40625 48.484375 -Q 26.265625 48.484375 22.1875 42.84375 -Q 18.109375 37.203125 18.109375 27.296875 -Q 18.109375 17.390625 22.1875 11.75 -Q 26.265625 6.109375 33.40625 6.109375 -Q 40.53125 6.109375 44.609375 11.75 -Q 48.6875 17.390625 48.6875 27.296875 -z -" id="DejaVuSans-112"/> - <path d="M 56.203125 29.59375 -L 56.203125 25.203125 -L 14.890625 25.203125 -Q 15.484375 15.921875 20.484375 11.0625 -Q 25.484375 6.203125 34.421875 6.203125 -Q 39.59375 6.203125 44.453125 7.46875 -Q 49.3125 8.734375 54.109375 11.28125 -L 54.109375 2.78125 -Q 49.265625 0.734375 44.1875 -0.34375 -Q 39.109375 -1.421875 33.890625 -1.421875 -Q 20.796875 -1.421875 13.15625 6.1875 -Q 5.515625 13.8125 5.515625 26.8125 -Q 5.515625 40.234375 12.765625 48.109375 -Q 20.015625 56 32.328125 56 -Q 43.359375 56 49.78125 48.890625 -Q 56.203125 41.796875 56.203125 29.59375 -z -M 47.21875 32.234375 -Q 47.125 39.59375 43.09375 43.984375 -Q 39.0625 48.390625 32.421875 48.390625 -Q 24.90625 48.390625 20.390625 44.140625 -Q 15.875 39.890625 15.1875 32.171875 -z -" id="DejaVuSans-101"/> - <path d="M 45.40625 46.390625 -L 45.40625 75.984375 -L 54.390625 75.984375 -L 54.390625 0 -L 45.40625 0 -L 45.40625 8.203125 -Q 42.578125 3.328125 38.25 0.953125 -Q 33.9375 -1.421875 27.875 -1.421875 -Q 17.96875 -1.421875 11.734375 6.484375 -Q 5.515625 14.40625 5.515625 27.296875 -Q 5.515625 40.1875 11.734375 48.09375 -Q 17.96875 56 27.875 56 -Q 33.9375 56 38.25 53.625 -Q 42.578125 51.265625 45.40625 46.390625 -z -M 14.796875 27.296875 -Q 14.796875 17.390625 18.875 11.75 -Q 22.953125 6.109375 30.078125 6.109375 -Q 37.203125 6.109375 41.296875 11.75 -Q 45.40625 17.390625 45.40625 27.296875 -Q 
45.40625 37.203125 41.296875 42.84375 -Q 37.203125 48.484375 30.078125 48.484375 -Q 22.953125 48.484375 18.875 42.84375 -Q 14.796875 37.203125 14.796875 27.296875 -z -" id="DejaVuSans-100"/> - <path id="DejaVuSans-32"/> - <path d="M 31 75.875 -Q 24.46875 64.65625 21.28125 53.65625 -Q 18.109375 42.671875 18.109375 31.390625 -Q 18.109375 20.125 21.3125 9.0625 -Q 24.515625 -2 31 -13.1875 -L 23.1875 -13.1875 -Q 15.875 -1.703125 12.234375 9.375 -Q 8.59375 20.453125 8.59375 31.390625 -Q 8.59375 42.28125 12.203125 53.3125 -Q 15.828125 64.359375 23.1875 75.875 -z -" id="DejaVuSans-40"/> - <path d="M 9.8125 72.90625 -L 24.515625 72.90625 -L 43.109375 23.296875 -L 61.8125 72.90625 -L 76.515625 72.90625 -L 76.515625 0 -L 66.890625 0 -L 66.890625 64.015625 -L 48.09375 14.015625 -L 38.1875 14.015625 -L 19.390625 64.015625 -L 19.390625 0 -L 9.8125 0 -z -" id="DejaVuSans-77"/> - <path d="M 9.421875 54.6875 -L 18.40625 54.6875 -L 18.40625 0 -L 9.421875 0 -z -M 9.421875 75.984375 -L 18.40625 75.984375 -L 18.40625 64.59375 -L 9.421875 64.59375 -z -" id="DejaVuSans-105"/> - <path d="M 19.671875 34.8125 -L 19.671875 8.109375 -L 35.5 8.109375 -Q 43.453125 8.109375 47.28125 11.40625 -Q 51.125 14.703125 51.125 21.484375 -Q 51.125 28.328125 47.28125 31.5625 -Q 43.453125 34.8125 35.5 34.8125 -z -M 19.671875 64.796875 -L 19.671875 42.828125 -L 34.28125 42.828125 -Q 41.5 42.828125 45.03125 45.53125 -Q 48.578125 48.25 48.578125 53.8125 -Q 48.578125 59.328125 45.03125 62.0625 -Q 41.5 64.796875 34.28125 64.796875 -z -M 9.8125 72.90625 -L 35.015625 72.90625 -Q 46.296875 72.90625 52.390625 68.21875 -Q 58.5 63.53125 58.5 54.890625 -Q 58.5 48.1875 55.375 44.234375 -Q 52.25 40.28125 46.1875 39.3125 -Q 53.46875 37.75 57.5 32.78125 -Q 61.53125 27.828125 61.53125 20.40625 -Q 61.53125 10.640625 54.890625 5.3125 -Q 48.25 0 35.984375 0 -L 9.8125 0 -z -" id="DejaVuSans-66"/> - <path d="M 25.390625 72.90625 -L 33.6875 72.90625 -L 8.296875 -9.28125 -L 0 -9.28125 -z -" id="DejaVuSans-47"/> - <path d="M 
44.28125 53.078125 -L 44.28125 44.578125 -Q 40.484375 46.53125 36.375 47.5 -Q 32.28125 48.484375 27.875 48.484375 -Q 21.1875 48.484375 17.84375 46.4375 -Q 14.5 44.390625 14.5 40.28125 -Q 14.5 37.15625 16.890625 35.375 -Q 19.28125 33.59375 26.515625 31.984375 -L 29.59375 31.296875 -Q 39.15625 29.25 43.1875 25.515625 -Q 47.21875 21.78125 47.21875 15.09375 -Q 47.21875 7.46875 41.1875 3.015625 -Q 35.15625 -1.421875 24.609375 -1.421875 -Q 20.21875 -1.421875 15.453125 -0.5625 -Q 10.6875 0.296875 5.421875 2 -L 5.421875 11.28125 -Q 10.40625 8.6875 15.234375 7.390625 -Q 20.0625 6.109375 24.8125 6.109375 -Q 31.15625 6.109375 34.5625 8.28125 -Q 37.984375 10.453125 37.984375 14.40625 -Q 37.984375 18.0625 35.515625 20.015625 -Q 33.0625 21.96875 24.703125 23.78125 -L 21.578125 24.515625 -Q 13.234375 26.265625 9.515625 29.90625 -Q 5.8125 33.546875 5.8125 39.890625 -Q 5.8125 47.609375 11.28125 51.796875 -Q 16.75 56 26.8125 56 -Q 31.78125 56 36.171875 55.265625 -Q 40.578125 54.546875 44.28125 53.078125 -z -" id="DejaVuSans-115"/> - <path d="M 8.015625 75.875 -L 15.828125 75.875 -Q 23.140625 64.359375 26.78125 53.3125 -Q 30.421875 42.28125 30.421875 31.390625 -Q 30.421875 20.453125 26.78125 9.375 -Q 23.140625 -1.703125 15.828125 -13.1875 -L 8.015625 -13.1875 -Q 14.5 -2 17.703125 9.0625 -Q 20.90625 20.125 20.90625 31.390625 -Q 20.90625 42.671875 17.703125 53.65625 -Q 14.5 64.65625 8.015625 75.875 -z -" id="DejaVuSans-41"/> - </defs> - <g style="fill:#262626;" transform="translate(208.497031 321.694187)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-83"/> - <use x="63.476562" xlink:href="#DejaVuSans-112"/> - <use x="126.953125" xlink:href="#DejaVuSans-101"/> - <use x="188.476562" xlink:href="#DejaVuSans-101"/> - <use x="250" xlink:href="#DejaVuSans-100"/> - <use x="313.476562" xlink:href="#DejaVuSans-32"/> - <use x="345.263672" xlink:href="#DejaVuSans-40"/> - <use x="384.277344" xlink:href="#DejaVuSans-77"/> - <use x="470.556641" xlink:href="#DejaVuSans-105"/> - <use x="498.339844" 
xlink:href="#DejaVuSans-66"/> - <use x="566.943359" xlink:href="#DejaVuSans-47"/> - <use x="600.634766" xlink:href="#DejaVuSans-115"/> - <use x="652.734375" xlink:href="#DejaVuSans-41"/> - </g> - </g> - </g> - <g id="matplotlib.axis_2"> - <g id="ytick_1"> - <g id="line2d_10"> - <defs> - <path d="M 0 0 -L -6 0 -" id="m7d1bb602a9" style="stroke:#262626;stroke-width:1.25;"/> - </defs> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="38.950125"/> - </g> - </g> - <g id="text_11"> - <!-- BLAKE3 --> - <defs> - <path d="M 9.8125 72.90625 -L 19.671875 72.90625 -L 19.671875 8.296875 -L 55.171875 8.296875 -L 55.171875 0 -L 9.8125 0 -z -" id="DejaVuSans-76"/> - <path d="M 34.1875 63.1875 -L 20.796875 26.90625 -L 47.609375 26.90625 -z -M 28.609375 72.90625 -L 39.796875 72.90625 -L 67.578125 0 -L 57.328125 0 -L 50.6875 18.703125 -L 17.828125 18.703125 -L 11.1875 0 -L 0.78125 0 -z -" id="DejaVuSans-65"/> - <path d="M 9.8125 72.90625 -L 19.671875 72.90625 -L 19.671875 42.09375 -L 52.390625 72.90625 -L 65.09375 72.90625 -L 28.90625 38.921875 -L 67.671875 0 -L 54.6875 0 -L 19.671875 35.109375 -L 19.671875 0 -L 9.8125 0 -z -" id="DejaVuSans-75"/> - <path d="M 9.8125 72.90625 -L 55.90625 72.90625 -L 55.90625 64.59375 -L 19.671875 64.59375 -L 19.671875 43.015625 -L 54.390625 43.015625 -L 54.390625 34.71875 -L 19.671875 34.71875 -L 19.671875 8.296875 -L 56.78125 8.296875 -L 56.78125 0 -L 9.8125 0 -z -" id="DejaVuSans-69"/> - </defs> - <g style="fill:#262626;" transform="translate(19.576719 43.129266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-66"/> - <use x="68.603516" xlink:href="#DejaVuSans-76"/> - <use x="124.347656" xlink:href="#DejaVuSans-65"/> - <use x="192.755859" xlink:href="#DejaVuSans-75"/> - <use x="258.332031" xlink:href="#DejaVuSans-69"/> - <use x="321.515625" xlink:href="#DejaVuSans-51"/> - </g> - </g> - </g> - <g id="ytick_2"> - <g id="line2d_11"> - <g> - <use 
style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="72.214125"/> - </g> - </g> - <g id="text_12"> - <!-- BLAKE2b --> - <defs> - <path d="M 48.6875 27.296875 -Q 48.6875 37.203125 44.609375 42.84375 -Q 40.53125 48.484375 33.40625 48.484375 -Q 26.265625 48.484375 22.1875 42.84375 -Q 18.109375 37.203125 18.109375 27.296875 -Q 18.109375 17.390625 22.1875 11.75 -Q 26.265625 6.109375 33.40625 6.109375 -Q 40.53125 6.109375 44.609375 11.75 -Q 48.6875 17.390625 48.6875 27.296875 -z -M 18.109375 46.390625 -Q 20.953125 51.265625 25.265625 53.625 -Q 29.59375 56 35.59375 56 -Q 45.5625 56 51.78125 48.09375 -Q 58.015625 40.1875 58.015625 27.296875 -Q 58.015625 14.40625 51.78125 6.484375 -Q 45.5625 -1.421875 35.59375 -1.421875 -Q 29.59375 -1.421875 25.265625 0.953125 -Q 20.953125 3.328125 18.109375 8.203125 -L 18.109375 0 -L 9.078125 0 -L 9.078125 75.984375 -L 18.109375 75.984375 -z -" id="DejaVuSans-98"/> - </defs> - <g style="fill:#262626;" transform="translate(12.593438 76.393266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-66"/> - <use x="68.603516" xlink:href="#DejaVuSans-76"/> - <use x="124.347656" xlink:href="#DejaVuSans-65"/> - <use x="192.755859" xlink:href="#DejaVuSans-75"/> - <use x="258.332031" xlink:href="#DejaVuSans-69"/> - <use x="321.515625" xlink:href="#DejaVuSans-50"/> - <use x="385.138672" xlink:href="#DejaVuSans-98"/> - </g> - </g> - </g> - <g id="ytick_3"> - <g id="line2d_12"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="105.478125"/> - </g> - </g> - <g id="text_13"> - <!-- SHA-1 --> - <defs> - <path d="M 9.8125 72.90625 -L 19.671875 72.90625 -L 19.671875 43.015625 -L 55.515625 43.015625 -L 55.515625 72.90625 -L 65.375 72.90625 -L 65.375 0 -L 55.515625 0 -L 55.515625 34.71875 -L 19.671875 34.71875 -L 19.671875 0 -L 9.8125 0 -z -" id="DejaVuSans-72"/> - <path d="M 4.890625 31.390625 -L 31.203125 31.390625 -L 31.203125 23.390625 -L 4.890625 
23.390625 -z -" id="DejaVuSans-45"/> - </defs> - <g style="fill:#262626;" transform="translate(28.199687 109.657266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-83"/> - <use x="63.476562" xlink:href="#DejaVuSans-72"/> - <use x="138.671875" xlink:href="#DejaVuSans-65"/> - <use x="207.048828" xlink:href="#DejaVuSans-45"/> - <use x="243.132812" xlink:href="#DejaVuSans-49"/> - </g> - </g> - </g> - <g id="ytick_4"> - <g id="line2d_13"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="138.742125"/> - </g> - </g> - <g id="text_14"> - <!-- BLAKE2s --> - <g style="fill:#262626;" transform="translate(13.846406 142.921266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-66"/> - <use x="68.603516" xlink:href="#DejaVuSans-76"/> - <use x="124.347656" xlink:href="#DejaVuSans-65"/> - <use x="192.755859" xlink:href="#DejaVuSans-75"/> - <use x="258.332031" xlink:href="#DejaVuSans-69"/> - <use x="321.515625" xlink:href="#DejaVuSans-50"/> - <use x="385.138672" xlink:href="#DejaVuSans-115"/> - </g> - </g> - </g> - <g id="ytick_5"> - <g id="line2d_14"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="172.006125"/> - </g> - </g> - <g id="text_15"> - <!-- MD5 --> - <defs> - <path d="M 19.671875 64.796875 -L 19.671875 8.109375 -L 31.59375 8.109375 -Q 46.6875 8.109375 53.6875 14.9375 -Q 60.6875 21.78125 60.6875 36.53125 -Q 60.6875 51.171875 53.6875 57.984375 -Q 46.6875 64.796875 31.59375 64.796875 -z -M 9.8125 72.90625 -L 30.078125 72.90625 -Q 51.265625 72.90625 61.171875 64.09375 -Q 71.09375 55.28125 71.09375 36.53125 -Q 71.09375 17.671875 61.125 8.828125 -Q 51.171875 0 30.078125 0 -L 9.8125 0 -z -" id="DejaVuSans-68"/> - </defs> - <g style="fill:#262626;" transform="translate(36.984219 176.185266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-77"/> - <use x="86.279297" xlink:href="#DejaVuSans-68"/> - <use x="163.28125" xlink:href="#DejaVuSans-53"/> - </g> 
- </g> - </g> - <g id="ytick_6"> - <g id="line2d_15"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="205.270125"/> - </g> - </g> - <g id="text_16"> - <!-- SHA-512 --> - <g style="fill:#262626;" transform="translate(14.202188 209.449266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-83"/> - <use x="63.476562" xlink:href="#DejaVuSans-72"/> - <use x="138.671875" xlink:href="#DejaVuSans-65"/> - <use x="207.048828" xlink:href="#DejaVuSans-45"/> - <use x="243.132812" xlink:href="#DejaVuSans-53"/> - <use x="306.755859" xlink:href="#DejaVuSans-49"/> - <use x="370.378906" xlink:href="#DejaVuSans-50"/> - </g> - </g> - </g> - <g id="ytick_7"> - <g id="line2d_16"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="238.534125"/> - </g> - </g> - <g id="text_17"> - <!-- SHA-256 --> - <g style="fill:#262626;" transform="translate(14.202188 242.713266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-83"/> - <use x="63.476562" xlink:href="#DejaVuSans-72"/> - <use x="138.671875" xlink:href="#DejaVuSans-65"/> - <use x="207.048828" xlink:href="#DejaVuSans-45"/> - <use x="243.132812" xlink:href="#DejaVuSans-50"/> - <use x="306.755859" xlink:href="#DejaVuSans-53"/> - <use x="370.378906" xlink:href="#DejaVuSans-54"/> - </g> - </g> - </g> - <g id="ytick_8"> - <g id="line2d_17"> - <g> - <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="271.798125"/> - </g> - </g> - <g id="text_18"> - <!-- SHA3-256 --> - <g style="fill:#262626;" transform="translate(7.2 275.977266)scale(0.11 -0.11)"> - <use xlink:href="#DejaVuSans-83"/> - <use x="63.476562" xlink:href="#DejaVuSans-72"/> - <use x="138.671875" xlink:href="#DejaVuSans-65"/> - <use x="207.080078" xlink:href="#DejaVuSans-51"/> - <use x="270.703125" xlink:href="#DejaVuSans-45"/> - <use x="306.787109" xlink:href="#DejaVuSans-50"/> - <use x="370.410156" 
xlink:href="#DejaVuSans-53"/> - <use x="434.033203" xlink:href="#DejaVuSans-54"/> - </g> - </g> - </g> - </g> - <g id="patch_3"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 25.644525 -L 377.942146 25.644525 -L 377.942146 52.255725 -L 71.443906 52.255725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="patch_4"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 58.908525 -L 130.011586 58.908525 -L 130.011586 85.519725 -L 71.443906 85.519725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="patch_5"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 92.172525 -L 117.289186 92.172525 -L 117.289186 118.783725 -L 71.443906 118.783725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="patch_6"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 125.436525 -L 110.548546 125.436525 -L 110.548546 152.047725 -L 71.443906 152.047725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="patch_7"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 158.700525 -L 104.477506 158.700525 -L 104.477506 185.311725 -L 71.443906 185.311725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="patch_8"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 191.964525 -L 103.584706 191.964525 -L 103.584706 218.575725 -L 71.443906 218.575725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="patch_9"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 225.228525 -L 93.049666 225.228525 -L 93.049666 251.839725 -L 71.443906 251.839725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="patch_10"> - <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 258.492525 -L 89.032066 258.492525 -L 89.032066 285.103725 -L 71.443906 285.103725 -z -" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> - </g> - <g id="line2d_18"> - <path 
clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="line2d_19"> - <path clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="line2d_20"> - <path clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="line2d_21"> - <path clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="line2d_22"> - <path clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="line2d_23"> - <path clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="line2d_24"> - <path clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="line2d_25"> - <path clip-path="url(#p6091bd3d0f)" d="M 0 0 -" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> - </g> - <g id="patch_11"> - <path d="M 71.443906 288.430125 -L 71.443906 22.318125 -" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> - </g> - <g id="patch_12"> - <path d="M 428.563906 288.430125 -L 428.563906 22.318125 -" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> - </g> - <g id="patch_13"> - <path d="M 71.443906 288.430125 -L 428.563906 288.430125 -" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> - </g> - <g id="patch_14"> - <path d="M 71.443906 22.318125 -L 428.563906 22.318125 -" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> - </g> - <g id="text_19"> - <!-- 6866 --> - <g style="fill:#262626;" 
transform="translate(382.406146 43.939725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-54"/> - <use x="63.623047" xlink:href="#DejaVuSans-56"/> - <use x="127.246094" xlink:href="#DejaVuSans-54"/> - <use x="190.869141" xlink:href="#DejaVuSans-54"/> - </g> - </g> - <g id="text_20"> - <!-- 1312 --> - <g style="fill:#262626;" transform="translate(134.475586 77.203725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-49"/> - <use x="63.623047" xlink:href="#DejaVuSans-51"/> - <use x="127.246094" xlink:href="#DejaVuSans-49"/> - <use x="190.869141" xlink:href="#DejaVuSans-50"/> - </g> - </g> - <g id="text_21"> - <!-- 1027 --> - <g style="fill:#262626;" transform="translate(121.753186 110.467725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-49"/> - <use x="63.623047" xlink:href="#DejaVuSans-48"/> - <use x="127.246094" xlink:href="#DejaVuSans-50"/> - <use x="190.869141" xlink:href="#DejaVuSans-55"/> - </g> - </g> - <g id="text_22"> - <!-- 876 --> - <g style="fill:#262626;" transform="translate(115.012546 143.731725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-56"/> - <use x="63.623047" xlink:href="#DejaVuSans-55"/> - <use x="127.246094" xlink:href="#DejaVuSans-54"/> - </g> - </g> - <g id="text_23"> - <!-- 740 --> - <g style="fill:#262626;" transform="translate(108.941506 176.995725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-55"/> - <use x="63.623047" xlink:href="#DejaVuSans-52"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - </g> - </g> - <g id="text_24"> - <!-- 720 --> - <g style="fill:#262626;" transform="translate(108.048706 210.259725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-55"/> - <use x="63.623047" xlink:href="#DejaVuSans-50"/> - <use x="127.246094" xlink:href="#DejaVuSans-48"/> - </g> - </g> - <g id="text_25"> - <!-- 484 --> - <g style="fill:#262626;" transform="translate(97.513666 243.523725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-52"/> - <use x="63.623047" xlink:href="#DejaVuSans-56"/> - <use x="127.246094" 
xlink:href="#DejaVuSans-52"/> - </g> - </g> - <g id="text_26"> - <!-- 394 --> - <defs> - <path d="M 10.984375 1.515625 -L 10.984375 10.5 -Q 14.703125 8.734375 18.5 7.8125 -Q 22.3125 6.890625 25.984375 6.890625 -Q 35.75 6.890625 40.890625 13.453125 -Q 46.046875 20.015625 46.78125 33.40625 -Q 43.953125 29.203125 39.59375 26.953125 -Q 35.25 24.703125 29.984375 24.703125 -Q 19.046875 24.703125 12.671875 31.3125 -Q 6.296875 37.9375 6.296875 49.421875 -Q 6.296875 60.640625 12.9375 67.421875 -Q 19.578125 74.21875 30.609375 74.21875 -Q 43.265625 74.21875 49.921875 64.515625 -Q 56.59375 54.828125 56.59375 36.375 -Q 56.59375 19.140625 48.40625 8.859375 -Q 40.234375 -1.421875 26.421875 -1.421875 -Q 22.703125 -1.421875 18.890625 -0.6875 -Q 15.09375 0.046875 10.984375 1.515625 -z -M 30.609375 32.421875 -Q 37.25 32.421875 41.125 36.953125 -Q 45.015625 41.5 45.015625 49.421875 -Q 45.015625 57.28125 41.125 61.84375 -Q 37.25 66.40625 30.609375 66.40625 -Q 23.96875 66.40625 20.09375 61.84375 -Q 16.21875 57.28125 16.21875 49.421875 -Q 16.21875 41.5 20.09375 36.953125 -Q 23.96875 32.421875 30.609375 32.421875 -z -" id="DejaVuSans-57"/> - </defs> - <g style="fill:#262626;" transform="translate(93.496066 276.787725)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-51"/> - <use x="63.623047" xlink:href="#DejaVuSans-57"/> - <use x="127.246094" xlink:href="#DejaVuSans-52"/> - </g> - </g> - <g id="text_27"> - <!-- Performance on AWS c5.metal, 16 KiB input, 1 thread --> - <defs> - <path d="M 19.671875 64.796875 -L 19.671875 37.40625 -L 32.078125 37.40625 -Q 38.96875 37.40625 42.71875 40.96875 -Q 46.484375 44.53125 46.484375 51.125 -Q 46.484375 57.671875 42.71875 61.234375 -Q 38.96875 64.796875 32.078125 64.796875 -z -M 9.8125 72.90625 -L 32.078125 72.90625 -Q 44.34375 72.90625 50.609375 67.359375 -Q 56.890625 61.8125 56.890625 51.125 -Q 56.890625 40.328125 50.609375 34.8125 -Q 44.34375 29.296875 32.078125 29.296875 -L 19.671875 29.296875 -L 19.671875 0 -L 9.8125 0 -z -" id="DejaVuSans-80"/> 
- <path d="M 41.109375 46.296875 -Q 39.59375 47.171875 37.8125 47.578125 -Q 36.03125 48 33.890625 48 -Q 26.265625 48 22.1875 43.046875 -Q 18.109375 38.09375 18.109375 28.8125 -L 18.109375 0 -L 9.078125 0 -L 9.078125 54.6875 -L 18.109375 54.6875 -L 18.109375 46.1875 -Q 20.953125 51.171875 25.484375 53.578125 -Q 30.03125 56 36.53125 56 -Q 37.453125 56 38.578125 55.875 -Q 39.703125 55.765625 41.0625 55.515625 -z -" id="DejaVuSans-114"/> - <path d="M 37.109375 75.984375 -L 37.109375 68.5 -L 28.515625 68.5 -Q 23.6875 68.5 21.796875 66.546875 -Q 19.921875 64.59375 19.921875 59.515625 -L 19.921875 54.6875 -L 34.71875 54.6875 -L 34.71875 47.703125 -L 19.921875 47.703125 -L 19.921875 0 -L 10.890625 0 -L 10.890625 47.703125 -L 2.296875 47.703125 -L 2.296875 54.6875 -L 10.890625 54.6875 -L 10.890625 58.5 -Q 10.890625 67.625 15.140625 71.796875 -Q 19.390625 75.984375 28.609375 75.984375 -z -" id="DejaVuSans-102"/> - <path d="M 30.609375 48.390625 -Q 23.390625 48.390625 19.1875 42.75 -Q 14.984375 37.109375 14.984375 27.296875 -Q 14.984375 17.484375 19.15625 11.84375 -Q 23.34375 6.203125 30.609375 6.203125 -Q 37.796875 6.203125 41.984375 11.859375 -Q 46.1875 17.53125 46.1875 27.296875 -Q 46.1875 37.015625 41.984375 42.703125 -Q 37.796875 48.390625 30.609375 48.390625 -z -M 30.609375 56 -Q 42.328125 56 49.015625 48.375 -Q 55.71875 40.765625 55.71875 27.296875 -Q 55.71875 13.875 49.015625 6.21875 -Q 42.328125 -1.421875 30.609375 -1.421875 -Q 18.84375 -1.421875 12.171875 6.21875 -Q 5.515625 13.875 5.515625 27.296875 -Q 5.515625 40.765625 12.171875 48.375 -Q 18.84375 56 30.609375 56 -z -" id="DejaVuSans-111"/> - <path d="M 52 44.1875 -Q 55.375 50.25 60.0625 53.125 -Q 64.75 56 71.09375 56 -Q 79.640625 56 84.28125 50.015625 -Q 88.921875 44.046875 88.921875 33.015625 -L 88.921875 0 -L 79.890625 0 -L 79.890625 32.71875 -Q 79.890625 40.578125 77.09375 44.375 -Q 74.3125 48.1875 68.609375 48.1875 -Q 61.625 48.1875 57.5625 43.546875 -Q 53.515625 38.921875 53.515625 30.90625 -L 53.515625 0 
-L 44.484375 0 -L 44.484375 32.71875 -Q 44.484375 40.625 41.703125 44.40625 -Q 38.921875 48.1875 33.109375 48.1875 -Q 26.21875 48.1875 22.15625 43.53125 -Q 18.109375 38.875 18.109375 30.90625 -L 18.109375 0 -L 9.078125 0 -L 9.078125 54.6875 -L 18.109375 54.6875 -L 18.109375 46.1875 -Q 21.1875 51.21875 25.484375 53.609375 -Q 29.78125 56 35.6875 56 -Q 41.65625 56 45.828125 52.96875 -Q 50 49.953125 52 44.1875 -z -" id="DejaVuSans-109"/> - <path d="M 34.28125 27.484375 -Q 23.390625 27.484375 19.1875 25 -Q 14.984375 22.515625 14.984375 16.5 -Q 14.984375 11.71875 18.140625 8.90625 -Q 21.296875 6.109375 26.703125 6.109375 -Q 34.1875 6.109375 38.703125 11.40625 -Q 43.21875 16.703125 43.21875 25.484375 -L 43.21875 27.484375 -z -M 52.203125 31.203125 -L 52.203125 0 -L 43.21875 0 -L 43.21875 8.296875 -Q 40.140625 3.328125 35.546875 0.953125 -Q 30.953125 -1.421875 24.3125 -1.421875 -Q 15.921875 -1.421875 10.953125 3.296875 -Q 6 8.015625 6 15.921875 -Q 6 25.140625 12.171875 29.828125 -Q 18.359375 34.515625 30.609375 34.515625 -L 43.21875 34.515625 -L 43.21875 35.40625 -Q 43.21875 41.609375 39.140625 45 -Q 35.0625 48.390625 27.6875 48.390625 -Q 23 48.390625 18.546875 47.265625 -Q 14.109375 46.140625 10.015625 43.890625 -L 10.015625 52.203125 -Q 14.9375 54.109375 19.578125 55.046875 -Q 24.21875 56 28.609375 56 -Q 40.484375 56 46.34375 49.84375 -Q 52.203125 43.703125 52.203125 31.203125 -z -" id="DejaVuSans-97"/> - <path d="M 54.890625 33.015625 -L 54.890625 0 -L 45.90625 0 -L 45.90625 32.71875 -Q 45.90625 40.484375 42.875 44.328125 -Q 39.84375 48.1875 33.796875 48.1875 -Q 26.515625 48.1875 22.3125 43.546875 -Q 18.109375 38.921875 18.109375 30.90625 -L 18.109375 0 -L 9.078125 0 -L 9.078125 54.6875 -L 18.109375 54.6875 -L 18.109375 46.1875 -Q 21.34375 51.125 25.703125 53.5625 -Q 30.078125 56 35.796875 56 -Q 45.21875 56 50.046875 50.171875 -Q 54.890625 44.34375 54.890625 33.015625 -z -" id="DejaVuSans-110"/> - <path d="M 48.78125 52.59375 -L 48.78125 44.1875 -Q 44.96875 46.296875 
41.140625 47.34375 -Q 37.3125 48.390625 33.40625 48.390625 -Q 24.65625 48.390625 19.8125 42.84375 -Q 14.984375 37.3125 14.984375 27.296875 -Q 14.984375 17.28125 19.8125 11.734375 -Q 24.65625 6.203125 33.40625 6.203125 -Q 37.3125 6.203125 41.140625 7.25 -Q 44.96875 8.296875 48.78125 10.40625 -L 48.78125 2.09375 -Q 45.015625 0.34375 40.984375 -0.53125 -Q 36.96875 -1.421875 32.421875 -1.421875 -Q 20.0625 -1.421875 12.78125 6.34375 -Q 5.515625 14.109375 5.515625 27.296875 -Q 5.515625 40.671875 12.859375 48.328125 -Q 20.21875 56 33.015625 56 -Q 37.15625 56 41.109375 55.140625 -Q 45.0625 54.296875 48.78125 52.59375 -z -" id="DejaVuSans-99"/> - <path d="M 3.328125 72.90625 -L 13.28125 72.90625 -L 28.609375 11.28125 -L 43.890625 72.90625 -L 54.984375 72.90625 -L 70.3125 11.28125 -L 85.59375 72.90625 -L 95.609375 72.90625 -L 77.296875 0 -L 64.890625 0 -L 49.515625 63.28125 -L 33.984375 0 -L 21.578125 0 -z -" id="DejaVuSans-87"/> - <path d="M 10.6875 12.40625 -L 21 12.40625 -L 21 0 -L 10.6875 0 -z -" id="DejaVuSans-46"/> - <path d="M 18.3125 70.21875 -L 18.3125 54.6875 -L 36.8125 54.6875 -L 36.8125 47.703125 -L 18.3125 47.703125 -L 18.3125 18.015625 -Q 18.3125 11.328125 20.140625 9.421875 -Q 21.96875 7.515625 27.59375 7.515625 -L 36.8125 7.515625 -L 36.8125 0 -L 27.59375 0 -Q 17.1875 0 13.234375 3.875 -Q 9.28125 7.765625 9.28125 18.015625 -L 9.28125 47.703125 -L 2.6875 47.703125 -L 2.6875 54.6875 -L 9.28125 54.6875 -L 9.28125 70.21875 -z -" id="DejaVuSans-116"/> - <path d="M 9.421875 75.984375 -L 18.40625 75.984375 -L 18.40625 0 -L 9.421875 0 -z -" id="DejaVuSans-108"/> - <path d="M 11.71875 12.40625 -L 22.015625 12.40625 -L 22.015625 4 -L 14.015625 -11.625 -L 7.71875 -11.625 -L 11.71875 4 -z -" id="DejaVuSans-44"/> - <path d="M 8.5 21.578125 -L 8.5 54.6875 -L 17.484375 54.6875 -L 17.484375 21.921875 -Q 17.484375 14.15625 20.5 10.265625 -Q 23.53125 6.390625 29.59375 6.390625 -Q 36.859375 6.390625 41.078125 11.03125 -Q 45.3125 15.671875 45.3125 23.6875 -L 45.3125 54.6875 -L 
54.296875 54.6875 -L 54.296875 0 -L 45.3125 0 -L 45.3125 8.40625 -Q 42.046875 3.421875 37.71875 1 -Q 33.40625 -1.421875 27.6875 -1.421875 -Q 18.265625 -1.421875 13.375 4.4375 -Q 8.5 10.296875 8.5 21.578125 -z -M 31.109375 56 -z -" id="DejaVuSans-117"/> - <path d="M 54.890625 33.015625 -L 54.890625 0 -L 45.90625 0 -L 45.90625 32.71875 -Q 45.90625 40.484375 42.875 44.328125 -Q 39.84375 48.1875 33.796875 48.1875 -Q 26.515625 48.1875 22.3125 43.546875 -Q 18.109375 38.921875 18.109375 30.90625 -L 18.109375 0 -L 9.078125 0 -L 9.078125 75.984375 -L 18.109375 75.984375 -L 18.109375 46.1875 -Q 21.34375 51.125 25.703125 53.5625 -Q 30.078125 56 35.796875 56 -Q 45.21875 56 50.046875 50.171875 -Q 54.890625 44.34375 54.890625 33.015625 -z -" id="DejaVuSans-104"/> - </defs> - <g style="fill:#262626;" transform="translate(88.626406 16.318125)scale(0.12 -0.12)"> - <use xlink:href="#DejaVuSans-80"/> - <use x="60.255859" xlink:href="#DejaVuSans-101"/> - <use x="121.779297" xlink:href="#DejaVuSans-114"/> - <use x="162.892578" xlink:href="#DejaVuSans-102"/> - <use x="198.097656" xlink:href="#DejaVuSans-111"/> - <use x="259.279297" xlink:href="#DejaVuSans-114"/> - <use x="300.376953" xlink:href="#DejaVuSans-109"/> - <use x="397.789062" xlink:href="#DejaVuSans-97"/> - <use x="459.068359" xlink:href="#DejaVuSans-110"/> - <use x="522.447266" xlink:href="#DejaVuSans-99"/> - <use x="577.427734" xlink:href="#DejaVuSans-101"/> - <use x="638.951172" xlink:href="#DejaVuSans-32"/> - <use x="670.738281" xlink:href="#DejaVuSans-111"/> - <use x="731.919922" xlink:href="#DejaVuSans-110"/> - <use x="795.298828" xlink:href="#DejaVuSans-32"/> - <use x="827.085938" xlink:href="#DejaVuSans-65"/> - <use x="895.416016" xlink:href="#DejaVuSans-87"/> - <use x="994.292969" xlink:href="#DejaVuSans-83"/> - <use x="1057.769531" xlink:href="#DejaVuSans-32"/> - <use x="1089.556641" xlink:href="#DejaVuSans-99"/> - <use x="1144.537109" xlink:href="#DejaVuSans-53"/> - <use x="1208.160156" xlink:href="#DejaVuSans-46"/> 
- <use x="1239.947266" xlink:href="#DejaVuSans-109"/> - <use x="1337.359375" xlink:href="#DejaVuSans-101"/> - <use x="1398.882812" xlink:href="#DejaVuSans-116"/> - <use x="1438.091797" xlink:href="#DejaVuSans-97"/> - <use x="1499.371094" xlink:href="#DejaVuSans-108"/> - <use x="1527.154297" xlink:href="#DejaVuSans-44"/> - <use x="1558.941406" xlink:href="#DejaVuSans-32"/> - <use x="1590.728516" xlink:href="#DejaVuSans-49"/> - <use x="1654.351562" xlink:href="#DejaVuSans-54"/> - <use x="1717.974609" xlink:href="#DejaVuSans-32"/> - <use x="1749.761719" xlink:href="#DejaVuSans-75"/> - <use x="1815.337891" xlink:href="#DejaVuSans-105"/> - <use x="1843.121094" xlink:href="#DejaVuSans-66"/> - <use x="1911.724609" xlink:href="#DejaVuSans-32"/> - <use x="1943.511719" xlink:href="#DejaVuSans-105"/> - <use x="1971.294922" xlink:href="#DejaVuSans-110"/> - <use x="2034.673828" xlink:href="#DejaVuSans-112"/> - <use x="2098.150391" xlink:href="#DejaVuSans-117"/> - <use x="2161.529297" xlink:href="#DejaVuSans-116"/> - <use x="2200.738281" xlink:href="#DejaVuSans-44"/> - <use x="2232.525391" xlink:href="#DejaVuSans-32"/> - <use x="2264.3125" xlink:href="#DejaVuSans-49"/> - <use x="2327.935547" xlink:href="#DejaVuSans-32"/> - <use x="2359.722656" xlink:href="#DejaVuSans-116"/> - <use x="2398.931641" xlink:href="#DejaVuSans-104"/> - <use x="2462.310547" xlink:href="#DejaVuSans-114"/> - <use x="2503.392578" xlink:href="#DejaVuSans-101"/> - <use x="2564.916016" xlink:href="#DejaVuSans-97"/> - <use x="2626.195312" xlink:href="#DejaVuSans-100"/> - </g> - </g> - </g> - </g> - <defs> - <clipPath id="p6091bd3d0f"> - <rect height="266.112" width="357.12" x="71.443906" y="22.318125"/> - </clipPath> - </defs> -</svg> diff --git a/thirdparty/BLAKE3/reference_impl/Cargo.toml b/thirdparty/BLAKE3/reference_impl/Cargo.toml deleted file mode 100644 index 8c81e5ad9..000000000 --- a/thirdparty/BLAKE3/reference_impl/Cargo.toml +++ /dev/null @@ -1,8 +0,0 @@ -[package] -name = "reference_impl" -version 
= "0.0.0" -edition = "2018" - -[lib] -name = "reference_impl" -path = "reference_impl.rs" diff --git a/thirdparty/BLAKE3/reference_impl/README.md b/thirdparty/BLAKE3/reference_impl/README.md deleted file mode 100644 index 941fafd72..000000000 --- a/thirdparty/BLAKE3/reference_impl/README.md +++ /dev/null @@ -1,9 +0,0 @@ -This is the reference implementation of BLAKE3. It is used for testing and -as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 -spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) -discusses this implementation. You can render docs for this implementation -by running `cargo doc --open` in this directory. - -This implementation is a single file -([`reference_impl.rs`](reference_impl.rs)) with no dependencies. It is -not optimized for performance. diff --git a/thirdparty/BLAKE3/reference_impl/reference_impl.rs b/thirdparty/BLAKE3/reference_impl/reference_impl.rs deleted file mode 100644 index 248834319..000000000 --- a/thirdparty/BLAKE3/reference_impl/reference_impl.rs +++ /dev/null @@ -1,383 +0,0 @@ -//! This is the reference implementation of BLAKE3. It is used for testing and -//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 -//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) -//! discusses this implementation. You can render docs for this implementation -//! by running `cargo doc --open` in this directory. -//! -//! # Example -//! -//! ``` -//! let mut hasher = reference_impl::Hasher::new(); -//! hasher.update(b"abc"); -//! hasher.update(b"def"); -//! let mut hash = [0; 32]; -//! hasher.finalize(&mut hash); -//! let mut extended_hash = [0; 500]; -//! hasher.finalize(&mut extended_hash); -//! assert_eq!(hash, extended_hash[..32]); -//! 
``` - -use core::cmp::min; -use core::convert::TryInto; - -const OUT_LEN: usize = 32; -const KEY_LEN: usize = 32; -const BLOCK_LEN: usize = 64; -const CHUNK_LEN: usize = 1024; - -const CHUNK_START: u32 = 1 << 0; -const CHUNK_END: u32 = 1 << 1; -const PARENT: u32 = 1 << 2; -const ROOT: u32 = 1 << 3; -const KEYED_HASH: u32 = 1 << 4; -const DERIVE_KEY_CONTEXT: u32 = 1 << 5; -const DERIVE_KEY_MATERIAL: u32 = 1 << 6; - -const IV: [u32; 8] = [ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; - -const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; - -// The mixing function, G, which mixes either a column or a diagonal. -fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { - state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); - state[d] = (state[d] ^ state[a]).rotate_right(16); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(12); - state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); - state[d] = (state[d] ^ state[a]).rotate_right(8); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(7); -} - -fn round(state: &mut [u32; 16], m: &[u32; 16]) { - // Mix the columns. - g(state, 0, 4, 8, 12, m[0], m[1]); - g(state, 1, 5, 9, 13, m[2], m[3]); - g(state, 2, 6, 10, 14, m[4], m[5]); - g(state, 3, 7, 11, 15, m[6], m[7]); - // Mix the diagonals. 
- g(state, 0, 5, 10, 15, m[8], m[9]); - g(state, 1, 6, 11, 12, m[10], m[11]); - g(state, 2, 7, 8, 13, m[12], m[13]); - g(state, 3, 4, 9, 14, m[14], m[15]); -} - -fn permute(m: &mut [u32; 16]) { - let mut permuted = [0; 16]; - for i in 0..16 { - permuted[i] = m[MSG_PERMUTATION[i]]; - } - *m = permuted; -} - -fn compress( - chaining_value: &[u32; 8], - block_words: &[u32; 16], - counter: u64, - block_len: u32, - flags: u32, -) -> [u32; 16] { - let mut state = [ - chaining_value[0], - chaining_value[1], - chaining_value[2], - chaining_value[3], - chaining_value[4], - chaining_value[5], - chaining_value[6], - chaining_value[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter as u32, - (counter >> 32) as u32, - block_len, - flags, - ]; - let mut block = *block_words; - - round(&mut state, &block); // round 1 - permute(&mut block); - round(&mut state, &block); // round 2 - permute(&mut block); - round(&mut state, &block); // round 3 - permute(&mut block); - round(&mut state, &block); // round 4 - permute(&mut block); - round(&mut state, &block); // round 5 - permute(&mut block); - round(&mut state, &block); // round 6 - permute(&mut block); - round(&mut state, &block); // round 7 - - for i in 0..8 { - state[i] ^= state[i + 8]; - state[i + 8] ^= chaining_value[i]; - } - state -} - -fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { - compression_output[0..8].try_into().unwrap() -} - -fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { - for (bytes_block, word) in bytes.chunks_exact(4).zip(words.iter_mut()) { - *word = u32::from_le_bytes(bytes_block.try_into().unwrap()); - } -} - -// Each chunk or parent node can produce either an 8-word chaining value or, by -// setting the ROOT flag, any number of final output bytes. The Output struct -// captures the state just prior to choosing between those two possibilities. 
-struct Output { - input_chaining_value: [u32; 8], - block_words: [u32; 16], - counter: u64, - block_len: u32, - flags: u32, -} - -impl Output { - fn chaining_value(&self) -> [u32; 8] { - first_8_words(compress( - &self.input_chaining_value, - &self.block_words, - self.counter, - self.block_len, - self.flags, - )) - } - - fn root_output_bytes(&self, out_slice: &mut [u8]) { - let mut output_block_counter = 0; - for out_block in out_slice.chunks_mut(2 * OUT_LEN) { - let words = compress( - &self.input_chaining_value, - &self.block_words, - output_block_counter, - self.block_len, - self.flags | ROOT, - ); - // The output length might not be a multiple of 4. - for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { - out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); - } - output_block_counter += 1; - } - } -} - -struct ChunkState { - chaining_value: [u32; 8], - chunk_counter: u64, - block: [u8; BLOCK_LEN], - block_len: u8, - blocks_compressed: u8, - flags: u32, -} - -impl ChunkState { - fn new(key: [u32; 8], chunk_counter: u64, flags: u32) -> Self { - Self { - chaining_value: key, - chunk_counter, - block: [0; BLOCK_LEN], - block_len: 0, - blocks_compressed: 0, - flags, - } - } - - fn len(&self) -> usize { - BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize - } - - fn start_flag(&self) -> u32 { - if self.blocks_compressed == 0 { - CHUNK_START - } else { - 0 - } - } - - fn update(&mut self, mut input: &[u8]) { - while !input.is_empty() { - // If the block buffer is full, compress it and clear it. More - // input is coming, so this compression is not CHUNK_END. 
- if self.block_len as usize == BLOCK_LEN { - let mut block_words = [0; 16]; - words_from_little_endian_bytes(&self.block, &mut block_words); - self.chaining_value = first_8_words(compress( - &self.chaining_value, - &block_words, - self.chunk_counter, - BLOCK_LEN as u32, - self.flags | self.start_flag(), - )); - self.blocks_compressed += 1; - self.block = [0; BLOCK_LEN]; - self.block_len = 0; - } - - // Copy input bytes into the block buffer. - let want = BLOCK_LEN - self.block_len as usize; - let take = min(want, input.len()); - self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); - self.block_len += take as u8; - input = &input[take..]; - } - } - - fn output(&self) -> Output { - let mut block_words = [0; 16]; - words_from_little_endian_bytes(&self.block, &mut block_words); - Output { - input_chaining_value: self.chaining_value, - block_words, - block_len: self.block_len as u32, - counter: self.chunk_counter, - flags: self.flags | self.start_flag() | CHUNK_END, - } - } -} - -fn parent_output( - left_child_cv: [u32; 8], - right_child_cv: [u32; 8], - key: [u32; 8], - flags: u32, -) -> Output { - let mut block_words = [0; 16]; - block_words[..8].copy_from_slice(&left_child_cv); - block_words[8..].copy_from_slice(&right_child_cv); - Output { - input_chaining_value: key, - block_words, - counter: 0, // Always 0 for parent nodes. - block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. - flags: PARENT | flags, - } -} - -fn parent_cv( - left_child_cv: [u32; 8], - right_child_cv: [u32; 8], - key: [u32; 8], - flags: u32, -) -> [u32; 8] { - parent_output(left_child_cv, right_child_cv, key, flags).chaining_value() -} - -/// An incremental hasher that can accept any number of writes. 
-pub struct Hasher { - chunk_state: ChunkState, - key: [u32; 8], - cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: - cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 - flags: u32, -} - -impl Hasher { - fn new_internal(key: [u32; 8], flags: u32) -> Self { - Self { - chunk_state: ChunkState::new(key, 0, flags), - key, - cv_stack: [[0; 8]; 54], - cv_stack_len: 0, - flags, - } - } - - /// Construct a new `Hasher` for the regular hash function. - pub fn new() -> Self { - Self::new_internal(IV, 0) - } - - /// Construct a new `Hasher` for the keyed hash function. - pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - let mut key_words = [0; 8]; - words_from_little_endian_bytes(key, &mut key_words); - Self::new_internal(key_words, KEYED_HASH) - } - - /// Construct a new `Hasher` for the key derivation function. The context - /// string should be hardcoded, globally unique, and application-specific. - pub fn new_derive_key(context: &str) -> Self { - let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); - context_hasher.update(context.as_bytes()); - let mut context_key = [0; KEY_LEN]; - context_hasher.finalize(&mut context_key); - let mut context_key_words = [0; 8]; - words_from_little_endian_bytes(&context_key, &mut context_key_words); - Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) - } - - fn push_stack(&mut self, cv: [u32; 8]) { - self.cv_stack[self.cv_stack_len as usize] = cv; - self.cv_stack_len += 1; - } - - fn pop_stack(&mut self) -> [u32; 8] { - self.cv_stack_len -= 1; - self.cv_stack[self.cv_stack_len as usize] - } - - // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. - fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { - // This chunk might complete some subtrees. For each completed subtree, - // its left child will be the current top entry in the CV stack, and - // its right child will be the current value of `new_cv`. 
Pop each left - // child off the stack, merge it with `new_cv`, and overwrite `new_cv` - // with the result. After all these merges, push the final value of - // `new_cv` onto the stack. The number of completed subtrees is given - // by the number of trailing 0-bits in the new total number of chunks. - while total_chunks & 1 == 0 { - new_cv = parent_cv(self.pop_stack(), new_cv, self.key, self.flags); - total_chunks >>= 1; - } - self.push_stack(new_cv); - } - - /// Add input to the hash state. This can be called any number of times. - pub fn update(&mut self, mut input: &[u8]) { - while !input.is_empty() { - // If the current chunk is complete, finalize it and reset the - // chunk state. More input is coming, so this chunk is not ROOT. - if self.chunk_state.len() == CHUNK_LEN { - let chunk_cv = self.chunk_state.output().chaining_value(); - let total_chunks = self.chunk_state.chunk_counter + 1; - self.add_chunk_chaining_value(chunk_cv, total_chunks); - self.chunk_state = ChunkState::new(self.key, total_chunks, self.flags); - } - - // Compress input bytes into the current chunk state. - let want = CHUNK_LEN - self.chunk_state.len(); - let take = min(want, input.len()); - self.chunk_state.update(&input[..take]); - input = &input[take..]; - } - } - - /// Finalize the hash and write any number of output bytes. - pub fn finalize(&self, out_slice: &mut [u8]) { - // Starting with the Output from the current chunk, compute all the - // parent chaining values along the right edge of the tree, until we - // have the root Output. 
- let mut output = self.chunk_state.output(); - let mut parent_nodes_remaining = self.cv_stack_len as usize; - while parent_nodes_remaining > 0 { - parent_nodes_remaining -= 1; - output = parent_output( - self.cv_stack[parent_nodes_remaining], - output.chaining_value(), - self.key, - self.flags, - ); - } - output.root_output_bytes(out_slice); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_avx2.rs b/thirdparty/BLAKE3/src/ffi_avx2.rs deleted file mode 100644 index d805e868e..000000000 --- a/thirdparty/BLAKE3/src/ffi_avx2.rs +++ /dev/null @@ -1,63 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Note that there is no AVX2 implementation of compress_in_place or -// compress_xof. - -// Unsafe because this may only be called on platforms supporting AVX2. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. 
- assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_avx2( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_hash_many_avx2( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_hash_many() { - if !crate::platform::avx2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_avx512.rs b/thirdparty/BLAKE3/src/ffi_avx512.rs deleted file mode 100644 index c1b9f649b..000000000 --- a/thirdparty/BLAKE3/src/ffi_avx512.rs +++ /dev/null @@ -1,114 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting AVX-512. -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) -} - -// Unsafe because this may only be called on platforms supporting AVX-512. -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut out = [0u8; 64]; - ffi::blake3_compress_xof_avx512( - cv.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - out.as_mut_ptr(), - ); - out -} - -// Unsafe because this may only be called on platforms supporting AVX-512. 
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_avx512( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_compress_in_place_avx512( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_avx512( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_avx512( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_compress() { - if !crate::platform::avx512_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::avx512_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_neon.rs b/thirdparty/BLAKE3/src/ffi_neon.rs deleted file mode 100644 index 889974277..000000000 --- a/thirdparty/BLAKE3/src/ffi_neon.rs +++ /dev/null @@ -1,82 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting NEON. 
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_neon( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -// blake3_neon.c normally depends on blake3_portable.c, because the NEON -// implementation only provides 4x compression, and it relies on the portable -// implementation for 1x compression. However, we expose the portable Rust -// implementation here instead, to avoid linking in unnecessary code. -#[no_mangle] -pub extern "C" fn blake3_compress_in_place_portable( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, -) { - unsafe { - crate::portable::compress_in_place( - &mut *(cv as *mut [u32; 8]), - &*(block as *const [u8; 64]), - block_len, - counter, - flags, - ) - } -} - -pub mod ffi { - extern "C" { - pub fn blake3_hash_many_neon( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_hash_many() { - // This entire file is gated on feature="neon", so NEON support is - // assumed here. 
- crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_sse2.rs b/thirdparty/BLAKE3/src/ffi_sse2.rs deleted file mode 100644 index c49a229ad..000000000 --- a/thirdparty/BLAKE3/src/ffi_sse2.rs +++ /dev/null @@ -1,114 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting SSE2. -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) -} - -// Unsafe because this may only be called on platforms supporting SSE2. -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut out = [0u8; 64]; - ffi::blake3_compress_xof_sse2( - cv.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - out.as_mut_ptr(), - ); - out -} - -// Unsafe because this may only be called on platforms supporting SSE2. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. 
- assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_sse2( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_compress_in_place_sse2( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_sse2( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_sse2( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_compress() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_sse41.rs b/thirdparty/BLAKE3/src/ffi_sse41.rs deleted file mode 100644 index 0b64c90a0..000000000 --- a/thirdparty/BLAKE3/src/ffi_sse41.rs +++ /dev/null @@ -1,114 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting SSE4.1. -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) -} - -// Unsafe because this may only be called on platforms supporting SSE4.1. 
-pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut out = [0u8; 64]; - ffi::blake3_compress_xof_sse41( - cv.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - out.as_mut_ptr(), - ); - out -} - -// Unsafe because this may only be called on platforms supporting SSE4.1. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_sse41( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_compress_in_place_sse41( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_sse41( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_sse41( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_compress() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/guts.rs 
b/thirdparty/BLAKE3/src/guts.rs deleted file mode 100644 index 88dcc86cd..000000000 --- a/thirdparty/BLAKE3/src/guts.rs +++ /dev/null @@ -1,95 +0,0 @@ -// This module is for incremental use cases like the `bao` crate, which need to -// get their hands on internal chunk and parent chaining values. The vast -// majority of users should ignore this and use the publicly documented -// interface instead. - -#[derive(Clone, Debug)] -pub struct ChunkState(crate::ChunkState); - -impl ChunkState { - // Currently this type only supports the regular hash mode. If an - // incremental user needs keyed_hash or derive_key, we can add that. - pub fn new(chunk_counter: u64) -> Self { - Self(crate::ChunkState::new( - crate::IV, - chunk_counter, - 0, - crate::platform::Platform::detect(), - )) - } - - #[inline] - pub fn len(&self) -> usize { - self.0.len() - } - - #[inline] - pub fn update(&mut self, input: &[u8]) -> &mut Self { - self.0.update(input); - self - } - - pub fn finalize(&self, is_root: bool) -> crate::Hash { - let output = self.0.output(); - if is_root { - output.root_hash() - } else { - output.chaining_value().into() - } - } -} - -// As above, this currently assumes the regular hash mode. If an incremental -// user needs keyed_hash or derive_key, we can add that. 
-pub fn parent_cv( - left_child: &crate::Hash, - right_child: &crate::Hash, - is_root: bool, -) -> crate::Hash { - let output = crate::parent_node_output( - left_child.as_bytes(), - right_child.as_bytes(), - crate::IV, - 0, - crate::platform::Platform::detect(), - ); - if is_root { - output.root_hash() - } else { - output.chaining_value().into() - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_chunk() { - assert_eq!( - crate::hash(b"foo"), - ChunkState::new(0).update(b"foo").finalize(true) - ); - } - - #[test] - fn test_parents() { - let mut hasher = crate::Hasher::new(); - let mut buf = [0; crate::CHUNK_LEN]; - - buf[0] = 'a' as u8; - hasher.update(&buf); - let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); - - buf[0] = 'b' as u8; - hasher.update(&buf); - let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); - - hasher.update(b"c"); - let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); - - let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); - let root = parent_cv(&parent, &chunk2_cv, true); - assert_eq!(hasher.finalize(), root); - } -} diff --git a/thirdparty/BLAKE3/src/join.rs b/thirdparty/BLAKE3/src/join.rs deleted file mode 100644 index 60932db1c..000000000 --- a/thirdparty/BLAKE3/src/join.rs +++ /dev/null @@ -1,120 +0,0 @@ -//! The multi-threading abstractions used by [`Hasher::update_with_join`]. -//! -//! Different implementations of the `Join` trait determine whether -//! [`Hasher::update_with_join`] performs multi-threading on sufficiently large -//! inputs. The `SerialJoin` implementation is single-threaded, and the -//! `RayonJoin` implementation (gated by the `rayon` feature) is -//! multi-threaded. Interfaces other than [`Hasher::update_with_join`], like -//! [`hash`] and [`Hasher::update`], always use `SerialJoin` internally. -//! -//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and -//! `RayonJoin` is the only non-trivial implementation provided. The only -//! 
difference between the function signature in the `Join` trait and the -//! underlying one in Rayon, is that the trait method includes two length -//! parameters. This gives an implementation the option of e.g. setting a -//! subtree size threshold below which it keeps splits on the same thread. -//! However, neither of the two provided implementations currently makes use of -//! those parameters. Note that in Rayon, the very first `join` call is more -//! expensive than subsequent calls, because it moves work from the calling -//! thread into the thread pool. That makes a coarse-grained input length -//! threshold in the caller more effective than a fine-grained subtree size -//! threshold after the implementation has already started recursing. -//! -//! # Example -//! -//! ``` -//! // Hash a large input using multi-threading. Note that multi-threading -//! // comes with some overhead, and it can actually hurt performance for small -//! // inputs. The meaning of "small" varies, however, depending on the -//! // platform and the number of threads. (On x86_64, the cutoff tends to be -//! // around 128 KiB.) You should benchmark your own use case to see whether -//! // multi-threading helps. -//! # #[cfg(feature = "rayon")] -//! # { -//! # fn some_large_input() -> &'static [u8] { b"foo" } -//! let input: &[u8] = some_large_input(); -//! let mut hasher = blake3::Hasher::new(); -//! hasher.update_with_join::<blake3::join::RayonJoin>(input); -//! let hash = hasher.finalize(); -//! # } -//! ``` -//! -//! [`Hasher::update_with_join`]: ../struct.Hasher.html#method.update_with_join -//! [`Hasher::update`]: ../struct.Hasher.html#method.update -//! [`hash`]: ../fn.hash.html -//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html - -/// The trait that abstracts over single-threaded and multi-threaded recursion. -/// -/// See the [`join` module docs](index.html) for more details. 
-pub trait Join { - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send; -} - -/// The trivial, serial implementation of `Join`. The left and right sides are -/// executed one after the other, on the calling thread. The standalone hashing -/// functions and the `Hasher::update` method use this implementation -/// internally. -/// -/// See the [`join` module docs](index.html) for more details. -pub enum SerialJoin {} - -impl Join for SerialJoin { - #[inline] - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send, - { - (oper_a(), oper_b()) - } -} - -/// The Rayon-based implementation of `Join`. The left and right sides are -/// executed on the Rayon thread pool, potentially in parallel. This -/// implementation is gated by the `rayon` feature, which is off by default. -/// -/// See the [`join` module docs](index.html) for more details. -#[cfg(feature = "rayon")] -pub enum RayonJoin {} - -#[cfg(feature = "rayon")] -impl Join for RayonJoin { - #[inline] - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send, - { - rayon::join(oper_a, oper_b) - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_serial_join() { - let oper_a = || 1 + 1; - let oper_b = || 2 + 2; - assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b, 3, 4)); - } - - #[test] - #[cfg(feature = "rayon")] - fn test_rayon_join() { - let oper_a = || 1 + 1; - let oper_b = || 2 + 2; - assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b, 3, 4)); - } -} diff --git a/thirdparty/BLAKE3/src/lib.rs b/thirdparty/BLAKE3/src/lib.rs deleted file mode 100644 index bf66b6dae..000000000 --- a/thirdparty/BLAKE3/src/lib.rs +++ /dev/null @@ -1,1359 +0,0 @@ -//! 
The official Rust implementation of the [BLAKE3] cryptographic hash -//! function. -//! -//! # Examples -//! -//! ``` -//! # fn main() -> Result<(), Box<dyn std::error::Error>> { -//! // Hash an input all at once. -//! let hash1 = blake3::hash(b"foobarbaz"); -//! -//! // Hash an input incrementally. -//! let mut hasher = blake3::Hasher::new(); -//! hasher.update(b"foo"); -//! hasher.update(b"bar"); -//! hasher.update(b"baz"); -//! let hash2 = hasher.finalize(); -//! assert_eq!(hash1, hash2); -//! -//! // Extended output. OutputReader also implements Read and Seek. -//! # #[cfg(feature = "std")] { -//! let mut output = [0; 1000]; -//! let mut output_reader = hasher.finalize_xof(); -//! output_reader.fill(&mut output); -//! assert_eq!(&output[..32], hash1.as_bytes()); -//! # } -//! -//! // Print a hash as hex. -//! println!("{}", hash1.to_hex()); -//! # Ok(()) -//! # } -//! ``` -//! -//! # Cargo Features -//! -//! The `rayon` feature provides [Rayon]-based multi-threading, in particular -//! the [`join::RayonJoin`] type for use with [`Hasher::update_with_join`]. It -//! is disabled by default, but enabled for [docs.rs]. -//! -//! The `neon` feature enables ARM NEON support. Currently there is no runtime -//! CPU feature detection for NEON, so you must only enable this feature for -//! targets that are known to have NEON support. In particular, some ARMv7 -//! targets support NEON, and some don't. -//! -//! The `std` feature (enabled by default) is required for implementations of -//! the [`Write`] and [`Seek`] traits, and also for runtime CPU feature -//! detection. If this feature is disabled, the only way to use the SIMD -//! implementations in this crate is to enable the corresponding instruction -//! sets statically for the entire build, with e.g. `RUSTFLAGS="-C -//! target-cpu=native"`. The resulting binary will not be portable to other -//! machines. -//! -//! [BLAKE3]: https://blake3.io -//! [Rayon]: https://github.com/rayon-rs/rayon -//! 
[`join::RayonJoin`]: join/enum.RayonJoin.html -//! [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -//! [docs.rs]: https://docs.rs/ -//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html -//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html - -#![cfg_attr(not(feature = "std"), no_std)] - -#[cfg(test)] -mod test; - -// The guts module is for incremental use cases like the `bao` crate that need -// to explicitly compute chunk and parent chaining values. It is semi-stable -// and likely to keep working, but largely undocumented and not intended for -// widespread use. -#[doc(hidden)] -pub mod guts; - -// The platform module is pub for benchmarks only. It is not stable. -#[doc(hidden)] -pub mod platform; - -// Platform-specific implementations of the compression function. These -// BLAKE3-specific cfg flags are set in build.rs. -#[cfg(blake3_avx2_rust)] -#[path = "rust_avx2.rs"] -mod avx2; -#[cfg(blake3_avx2_ffi)] -#[path = "ffi_avx2.rs"] -mod avx2; -#[cfg(blake3_avx512_ffi)] -#[path = "ffi_avx512.rs"] -mod avx512; -#[cfg(feature = "neon")] -#[path = "ffi_neon.rs"] -mod neon; -mod portable; -#[cfg(blake3_sse2_rust)] -#[path = "rust_sse2.rs"] -mod sse2; -#[cfg(blake3_sse2_ffi)] -#[path = "ffi_sse2.rs"] -mod sse2; -#[cfg(blake3_sse41_rust)] -#[path = "rust_sse41.rs"] -mod sse41; -#[cfg(blake3_sse41_ffi)] -#[path = "ffi_sse41.rs"] -mod sse41; - -pub mod traits; - -pub mod join; - -use arrayref::{array_mut_ref, array_ref}; -use arrayvec::{ArrayString, ArrayVec}; -use core::cmp; -use core::fmt; -use join::{Join, SerialJoin}; -use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; - -/// The number of bytes in a [`Hash`](struct.Hash.html), 32. -pub const OUT_LEN: usize = 32; - -/// The number of bytes in a key, 32. -pub const KEY_LEN: usize = 32; - -// These constants are pub for incremental use cases like `bao`, as well as -// tests and benchmarks. Most callers should not need them. 
-#[doc(hidden)] -pub const BLOCK_LEN: usize = 64; -#[doc(hidden)] -pub const CHUNK_LEN: usize = 1024; -#[doc(hidden)] -pub const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 - -// While iterating the compression function within a chunk, the CV is -// represented as words, to avoid doing two extra endianness conversions for -// each compression in the portable implementation. But the hash_many interface -// needs to hash both input bytes and parent nodes, so it's better for its -// output CVs to be represented as bytes. -type CVWords = [u32; 8]; -type CVBytes = [u8; 32]; // little-endian - -const IV: &CVWords = &[ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; - -const MSG_SCHEDULE: [[usize; 16]; 7] = [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], - [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], - [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], - [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], - [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], - [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], -]; - -// These are the internal flags that we use to domain separate root/non-root, -// chunk/parent, and chunk beginning/middle/end. These get set at the high end -// of the block flags word in the compression function, so their values start -// high and go down. -// NOTE(review): the constants below start at the lowest bit (1 << 0) and go -// up, which does not match the "start high and go down" wording above; -// confirm which is intended. -const CHUNK_START: u8 = 1 << 0; -const CHUNK_END: u8 = 1 << 1; -const PARENT: u8 = 1 << 2; -const ROOT: u8 = 1 << 3; -const KEYED_HASH: u8 = 1 << 4; -const DERIVE_KEY_CONTEXT: u8 = 1 << 5; -const DERIVE_KEY_MATERIAL: u8 = 1 << 6; - -// The low 32 bits of `counter` (the `as u32` cast truncates). -#[inline] -fn counter_low(counter: u64) -> u32 { - counter as u32 -} - -// The high 32 bits of `counter`. -#[inline] -fn counter_high(counter: u64) -> u32 { - (counter >> 32) as u32 -} - -/// An output of the default size, 32 bytes, which provides constant-time -/// equality checking. 
-/// -/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides an -/// explicit [`as_bytes`] method returning `&[u8; 32]`. However, byte arrays -/// and slices don't provide constant-time equality checking, which is often a -/// security requirement in software that handles private data. `Hash` doesn't -/// implement [`Deref`] or [`AsRef`], to avoid situations where a type -/// conversion happens implicitly and the constant-time property is -/// accidentally lost. -/// -/// `Hash` provides the [`to_hex`] method for converting to hexadecimal. It -/// doesn't directly support converting from hexadecimal, but here's an example -/// of doing that with the [`hex`] crate: -/// -/// ``` -/// # fn main() -> Result<(), Box<dyn std::error::Error>> { -/// use std::convert::TryInto; -/// -/// let hash_hex = "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"; -/// let hash_bytes = hex::decode(hash_hex)?; -/// let hash_array: [u8; blake3::OUT_LEN] = hash_bytes[..].try_into()?; -/// let hash: blake3::Hash = hash_array.into(); -/// # Ok(()) -/// # } -/// ``` -/// -/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html -/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html -/// [`as_bytes`]: #method.as_bytes -/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html -/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html -/// [`to_hex`]: #method.to_hex -/// [`hex`]: https://crates.io/crates/hex -#[derive(Clone, Copy, Hash)] -pub struct Hash([u8; OUT_LEN]); - -impl Hash { - /// The bytes of the `Hash`. Note that byte arrays don't provide - /// constant-time equality checking, so if you need to compare hashes, - /// prefer the `Hash` type. - #[inline] - pub fn as_bytes(&self) -> &[u8; OUT_LEN] { - &self.0 - } - - /// The hexadecimal encoding of the `Hash`. The returned [`ArrayString`] is - /// a fixed size and doesn't allocate memory on the heap. 
Note that - /// [`ArrayString`] doesn't provide constant-time equality checking, so if - /// you need to compare hashes, prefer the `Hash` type. - /// - /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html - pub fn to_hex(&self) -> ArrayString<[u8; 2 * OUT_LEN]> { - let mut s = ArrayString::new(); - let table = b"0123456789abcdef"; - for &b in self.0.iter() { - s.push(table[(b >> 4) as usize] as char); - s.push(table[(b & 0xf) as usize] as char); - } - s - } -} - -impl From<[u8; OUT_LEN]> for Hash { - #[inline] - fn from(bytes: [u8; OUT_LEN]) -> Self { - Self(bytes) - } -} - -impl From<Hash> for [u8; OUT_LEN] { - #[inline] - fn from(hash: Hash) -> Self { - hash.0 - } -} - -/// This implementation is constant-time. -impl PartialEq for Hash { - #[inline] - fn eq(&self, other: &Hash) -> bool { - constant_time_eq::constant_time_eq_32(&self.0, &other.0) - } -} - -/// This implementation is constant-time. -impl PartialEq<[u8; OUT_LEN]> for Hash { - #[inline] - fn eq(&self, other: &[u8; OUT_LEN]) -> bool { - constant_time_eq::constant_time_eq_32(&self.0, other) - } -} - -impl Eq for Hash {} - -impl fmt::Debug for Hash { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // Formatting field as `&str` to reduce code size since the `Debug` - // dynamic dispatch table for `&str` is likely needed elsewhere already, - // but that for `ArrayString<[u8; 64]>` is not. - let hex = self.to_hex(); - let hex: &str = hex.as_str(); - - f.debug_tuple("Hash").field(&hex).finish() - } -} - -// Each chunk or parent node can produce either a 32-byte chaining value or, by -// setting the ROOT flag, any number of final output bytes. The Output struct -// captures the state just prior to choosing between those two possibilities. 
-#[derive(Clone)] -struct Output { - input_chaining_value: CVWords, - block: [u8; 64], - block_len: u8, - counter: u64, - flags: u8, - platform: Platform, -} - -impl Output { - fn chaining_value(&self) -> CVBytes { - let mut cv = self.input_chaining_value; - self.platform.compress_in_place( - &mut cv, - &self.block, - self.block_len, - self.counter, - self.flags, - ); - platform::le_bytes_from_words_32(&cv) - } - - fn root_hash(&self) -> Hash { - debug_assert_eq!(self.counter, 0); - let mut cv = self.input_chaining_value; - self.platform - .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); - Hash(platform::le_bytes_from_words_32(&cv)) - } - - fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { - self.platform.compress_xof( - &self.input_chaining_value, - &self.block, - self.block_len, - self.counter, - self.flags | ROOT, - ) - } -} - -#[derive(Clone)] -struct ChunkState { - cv: CVWords, - chunk_counter: u64, - buf: [u8; BLOCK_LEN], - buf_len: u8, - blocks_compressed: u8, - flags: u8, - platform: Platform, -} - -impl ChunkState { - fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { - Self { - cv: *key, - chunk_counter, - buf: [0; BLOCK_LEN], - buf_len: 0, - blocks_compressed: 0, - flags, - platform, - } - } - - fn len(&self) -> usize { - BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize - } - - fn fill_buf(&mut self, input: &mut &[u8]) { - let want = BLOCK_LEN - self.buf_len as usize; - let take = cmp::min(want, input.len()); - self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); - self.buf_len += take as u8; - *input = &input[take..]; - } - - fn start_flag(&self) -> u8 { - if self.blocks_compressed == 0 { - CHUNK_START - } else { - 0 - } - } - - // Try to avoid buffering as much as possible, by compressing directly from - // the input slice when full blocks are available. 
- fn update(&mut self, mut input: &[u8]) -> &mut Self { - if self.buf_len > 0 { - self.fill_buf(&mut input); - if !input.is_empty() { - debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); - let block_flags = self.flags | self.start_flag(); // borrowck - self.platform.compress_in_place( - &mut self.cv, - &self.buf, - BLOCK_LEN as u8, - self.chunk_counter, - block_flags, - ); - self.buf_len = 0; - self.buf = [0; BLOCK_LEN]; - self.blocks_compressed += 1; - } - } - - while input.len() > BLOCK_LEN { - debug_assert_eq!(self.buf_len, 0); - let block_flags = self.flags | self.start_flag(); // borrowck - self.platform.compress_in_place( - &mut self.cv, - array_ref!(input, 0, BLOCK_LEN), - BLOCK_LEN as u8, - self.chunk_counter, - block_flags, - ); - self.blocks_compressed += 1; - input = &input[BLOCK_LEN..]; - } - - self.fill_buf(&mut input); - debug_assert!(input.is_empty()); - debug_assert!(self.len() <= CHUNK_LEN); - self - } - - fn output(&self) -> Output { - let block_flags = self.flags | self.start_flag() | CHUNK_END; - Output { - input_chaining_value: self.cv, - block: self.buf, - block_len: self.buf_len, - counter: self.chunk_counter, - flags: block_flags, - platform: self.platform, - } - } -} - -// Don't derive(Debug), because the state may be secret. -impl fmt::Debug for ChunkState { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("ChunkState") - .field("len", &self.len()) - .field("chunk_counter", &self.chunk_counter) - .field("flags", &self.flags) - .field("platform", &self.platform) - .finish() - } -} - -// IMPLEMENTATION NOTE -// =================== -// The recursive function compress_subtree_wide(), implemented below, is the -// basis of high-performance BLAKE3. We use it both for all-at-once hashing, -// and for the incremental input with Hasher (though we have to be careful with -// subtree boundaries in the incremental case). 
compress_subtree_wide() applies -// several optimizations at the same time: -// - Multi-threading with Rayon. -// - Parallel chunk hashing with SIMD. -// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing -// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues -// to benefit from larger inputs, because more levels of the tree can -// use full-width SIMD vectors for parent hashing. Without parallel parent -// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. - -// pub for benchmarks -#[doc(hidden)] -#[derive(Clone, Copy)] -pub enum IncrementCounter { - Yes, - No, -} - -impl IncrementCounter { - #[inline] - fn yes(&self) -> bool { - match self { - IncrementCounter::Yes => true, - IncrementCounter::No => false, - } - } -} - -// The largest power of two less than or equal to `n`, used for left_len() -// immediately below, and also directly in Hasher::update(). -// Note that for n = 0 this returns 1; left_len() passes n >= 1 whenever its -// content_len > CHUNK_LEN precondition holds. -fn largest_power_of_two_leq(n: usize) -> usize { - ((n / 2) + 1).next_power_of_two() -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -fn left_len(content_len: usize) -> usize { - debug_assert!(content_len > CHUNK_LEN); - // Subtract 1 to reserve at least one byte for the right side. - let full_chunks = (content_len - 1) / CHUNK_LEN; - largest_power_of_two_leq(full_chunks) * CHUNK_LEN -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. 
-fn compress_chunks_parallel( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert!(!input.is_empty(), "empty chunks below the root"); - debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); - - let mut chunks_exact = input.chunks_exact(CHUNK_LEN); - let mut chunks_array = ArrayVec::<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]>::new(); - for chunk in &mut chunks_exact { - chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN)); - } - platform.hash_many( - &chunks_array, - key, - chunk_counter, - IncrementCounter::Yes, - flags, - CHUNK_START, - CHUNK_END, - out, - ); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. - let chunks_so_far = chunks_array.len(); - if !chunks_exact.remainder().is_empty() { - let counter = chunk_counter + chunks_so_far as u64; - let mut chunk_state = ChunkState::new(key, counter, flags, platform); - chunk_state.update(chunks_exact.remainder()); - *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = - chunk_state.output().chaining_value(); - chunks_so_far + 1 - } else { - chunks_so_far - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. 
-fn compress_parents_parallel( - child_chaining_values: &[u8], - key: &CVWords, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); - let num_children = child_chaining_values.len() / OUT_LEN; - debug_assert!(num_children >= 2, "not enough children"); - debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); - - let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); - // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of - // the requirements of compress_subtree_wide(). - let mut parents_array = ArrayVec::<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE_OR_2]>::new(); - for parent in &mut parents_exact { - parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); - } - platform.hash_many( - &parents_array, - key, - 0, // Parents always use counter 0. - IncrementCounter::No, - flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out, - ); - - // If there's an odd child left over, it becomes an output. - let parents_so_far = parents_array.len(); - if !parents_exact.remainder().is_empty() { - out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); - parents_so_far + 1 - } else { - parents_so_far - } -} - -// The wide helper function returns (writes out) an array of chaining values -// and returns the length of that array. The number of chaining values returned -// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, -// if the input is shorter than that many chunks. The reason for maintaining a -// wide array of chaining values going back up the tree, is to allow the -// implementation to hash as many parents in parallel as possible. -// -// As a special case when the SIMD degree is 1, this function will still return -// at least 2 outputs. This guarantees that this function doesn't perform the -// root compression. 
(If it did, it would use the wrong flags, and also we -// wouldn't be able to implement extendable output.) Note that this function is -// not used when the whole input is only 1 chunk long; that's a different -// codepath. -// -// Why not just have the caller split the input on the first update(), instead -// of implementing this special rule? Because we don't want to limit SIMD or -// multi-threading parallelism for that update(). -fn compress_subtree_wide<J: Join>( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. This allows Rayon the option of multi-threading even the - // 2-chunk case, which can help performance on smaller platforms. - if input.len() <= platform.simd_degree() * CHUNK_LEN { - return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); - let (left, right) = input.split_at(left_len(input.len())); - let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; - - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let degree = if left.len() == CHUNK_LEN { - // The "simd_degree=1 and we're at the leaf nodes" case. - debug_assert_eq!(platform.simd_degree(), 1); - 1 - } else { - cmp::max(platform.simd_degree(), 2) - }; - let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); - - // Recurse! 
This uses multiple threads if the "rayon" feature is enabled. - let (left_n, right_n) = J::join( - || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out), - || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out), - left.len(), - right.len(), - ); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - debug_assert_eq!(left_n, degree); - debug_assert!(right_n >= 1 && right_n <= left_n); - if left_n == 1 { - out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); - return 2; - } - - // Otherwise, do one layer of parent node compression. - let num_children = left_n + right_n; - compress_parents_parallel( - &cv_array[..num_children * OUT_LEN], - key, - flags, - platform, - out, - ) -} - -// Hash a subtree with compress_subtree_wide(), and then condense the resulting -// list of chaining values down to a single parent node. Don't compress that -// last parent node, however. Instead, return its message bytes (the -// concatenated chaining values of its children). This is necessary when the -// first call to update() supplies a complete subtree, because the topmost -// parent node of that subtree could end up being the root. It's also necessary -// for extended output in the general case. -// -// As with compress_subtree_wide(), this function is not used on inputs of 1 -// chunk or less. That's a different codepath. 
-fn compress_subtree_to_parent_node<J: Join>( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, -) -> [u8; BLOCK_LEN] { - debug_assert!(input.len() > CHUNK_LEN); - let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let mut num_cvs = - compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array); - debug_assert!(num_cvs >= 2); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. - let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; - while num_cvs > 2 { - let cv_slice = &cv_array[..num_cvs * OUT_LEN]; - num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); - cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); - } - *array_ref!(cv_array, 0, 2 * OUT_LEN) -} - -// Hash a complete input all at once. Unlike compress_subtree_wide() and -// compress_subtree_to_parent_node(), this function handles the 1 chunk case. -// Note that we use SerialJoin here, so this is always single-threaded. -fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { - let platform = Platform::detect(); - - // If the whole subtree is one chunk, hash it directly with a ChunkState. - if input.len() <= CHUNK_LEN { - return ChunkState::new(key, 0, flags, platform) - .update(input) - .output(); - } - - // Otherwise construct an Output object from the parent node returned by - // compress_subtree_to_parent_node(). - Output { - input_chaining_value: *key, - block: compress_subtree_to_parent_node::<SerialJoin>(input, key, 0, flags, platform), - block_len: BLOCK_LEN as u8, - counter: 0, - flags: flags | PARENT, - platform, - } -} - -/// The default hash function. -/// -/// For an incremental version that accepts multiple writes, see [`Hasher::update`]. -/// -/// This function is always single-threaded. 
For multi-threading support, see -/// [`Hasher::update_with_join`]. -/// -/// [`Hasher::update`]: struct.Hasher.html#method.update -/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -pub fn hash(input: &[u8]) -> Hash { - hash_all_at_once(input, IV, 0).root_hash() -} - -/// The keyed hash function. -/// -/// This is suitable for use as a message authentication code, for -/// example to replace an HMAC instance. -/// In that use case, the constant-time equality checking provided by -/// [`Hash`](struct.Hash.html) is almost always a security requirement, and -/// callers need to be careful not to compare MACs as raw bytes. -/// -/// This function is always single-threaded. For multi-threading support, see -/// [`Hasher::update_with_join`]. -/// -/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { - let key_words = platform::words_from_le_bytes_32(key); - hash_all_at_once(input, &key_words, KEYED_HASH).root_hash() -} - -/// The key derivation function. -/// -/// Given cryptographic key material of any length and a context string of any -/// length, this function outputs a derived subkey of any length. **The context -/// string should be hardcoded, globally unique, and application-specific.** A -/// good default format for such strings is `"[application] [commit timestamp] -/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. -/// -/// Key derivation is important when you want to use the same key in multiple -/// algorithms or use cases. Using the same key with different cryptographic -/// algorithms is generally forbidden, and deriving a separate subkey for each -/// use case protects you from bad interactions. Derived keys also mitigate the -/// damage from one part of your application accidentally leaking its key. 
-/// -/// As a rare exception to that general rule, however, it is possible to use -/// `derive_key` itself with key material that you are already using with -/// another algorithm. You might need to do this if you're adding features to -/// an existing application, which does not yet use key derivation internally. -/// However, you still must not share key material with algorithms that forbid -/// key reuse entirely, like a one-time pad. -/// -/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be -/// used with passwords.** Instead, use a dedicated password hash like -/// [Argon2]. Password hashes are entirely different from generic hash -/// functions, with opposite design requirements. -/// -/// This function is always single-threaded. For multi-threading support, see -/// [`Hasher::update_with_join`]. -/// -/// [`Hasher::new_derive_key`]: struct.Hasher.html#method.new_derive_key -/// [`Hasher::finalize_xof`]: struct.Hasher.html#method.finalize_xof -/// [Argon2]: https://en.wikipedia.org/wiki/Argon2 -/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -pub fn derive_key(context: &str, key_material: &[u8], output: &mut [u8]) { - let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - let inner_output = hash_all_at_once(key_material, &context_key_words, DERIVE_KEY_MATERIAL); - OutputReader::new(inner_output).fill(output); -} - -fn parent_node_output( - left_child: &CVBytes, - right_child: &CVBytes, - key: &CVWords, - flags: u8, - platform: Platform, -) -> Output { - let mut block = [0; BLOCK_LEN]; - block[..32].copy_from_slice(left_child); - block[32..].copy_from_slice(right_child); - Output { - input_chaining_value: *key, - block, - block_len: BLOCK_LEN as u8, - counter: 0, - flags: flags | PARENT, - platform, - } -} - -/// An incremental hash state that can accept any number of writes. 
-/// -/// In addition to its inherent methods, this type implements several commonly -/// used traits from the [`digest`](https://crates.io/crates/digest) and -/// [`crypto_mac`](https://crates.io/crates/crypto-mac) crates. -/// -/// **Performance note:** The [`update`] and [`update_with_join`] methods -/// perform poorly when the caller's input buffer is small. See their method -/// docs below. A 16 KiB buffer is large enough to leverage all currently -/// supported SIMD instruction sets. -/// -/// # Examples -/// -/// ``` -/// # fn main() -> Result<(), Box<dyn std::error::Error>> { -/// // Hash an input incrementally. -/// let mut hasher = blake3::Hasher::new(); -/// hasher.update(b"foo"); -/// hasher.update(b"bar"); -/// hasher.update(b"baz"); -/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz")); -/// -/// // Extended output. OutputReader also implements Read and Seek. -/// # #[cfg(feature = "std")] { -/// let mut output = [0; 1000]; -/// let mut output_reader = hasher.finalize_xof(); -/// output_reader.fill(&mut output); -/// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes()); -/// # } -/// # Ok(()) -/// # } -/// ``` -/// -/// [`update`]: #method.update -/// [`update_with_join`]: #method.update_with_join -#[derive(Clone)] -pub struct Hasher { - key: CVWords, - chunk_state: ChunkState, - // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, - // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk - // requires a 4th entry, rather than merging everything down to 1, because - // we don't know whether more input is coming. This is different from how - // the reference implementation does things. - cv_stack: ArrayVec<[CVBytes; MAX_DEPTH + 1]>, -} - -impl Hasher { - fn new_internal(key: &CVWords, flags: u8) -> Self { - Self { - key: *key, - chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), - cv_stack: ArrayVec::new(), - } - } - - /// Construct a new `Hasher` for the regular hash function. 
- pub fn new() -> Self { - Self::new_internal(IV, 0) - } - - /// Construct a new `Hasher` for the keyed hash function. See - /// [`keyed_hash`]. - /// - /// [`keyed_hash`]: fn.keyed_hash.html - pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - let key_words = platform::words_from_le_bytes_32(key); - Self::new_internal(&key_words, KEYED_HASH) - } - - /// Construct a new `Hasher` for the key derivation function. See - /// [`derive_key`]. The context string should be hardcoded, globally - /// unique, and application-specific. - /// - /// [`derive_key`]: fn.derive_key.html - pub fn new_derive_key(context: &str) -> Self { - let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) - } - - /// Reset the `Hasher` to its initial state. - /// - /// This is functionally the same as overwriting the `Hasher` with a new - /// one, using the same key or context string if any. However, depending on - /// how much inlining the optimizer does, moving a `Hasher` might copy its - /// entire CV stack, most of which is useless uninitialized bytes. This - /// method avoids that copy. - pub fn reset(&mut self) -> &mut Self { - self.chunk_state = ChunkState::new( - &self.key, - 0, - self.chunk_state.flags, - self.chunk_state.platform, - ); - self.cv_stack.clear(); - self - } - - // As described in push_cv() below, we do "lazy merging", delaying merges - // until right before the next CV is about to be added. This is different - // from the reference implementation. Another difference is that we aren't - // always merging 1 chunk at a time. Instead, each CV might represent any - // power-of-two number of chunks, as long as the smaller-above-larger stack - // order is maintained. 
Instead of the "count the trailing 0-bits" - // algorithm described in the spec, we use a "count the total number of - // 1-bits" variant that doesn't require us to retain the subtree size of - // the CV on top of the stack. The principle is the same: each CV that - // should remain in the stack is represented by a 1-bit in the total number - // of chunks (or bytes) so far. - fn merge_cv_stack(&mut self, total_len: u64) { - let post_merge_stack_len = total_len.count_ones() as usize; - while self.cv_stack.len() > post_merge_stack_len { - let right_child = self.cv_stack.pop().unwrap(); - let left_child = self.cv_stack.pop().unwrap(); - let parent_output = parent_node_output( - &left_child, - &right_child, - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - self.cv_stack.push(parent_output.chaining_value()); - } - } - - // In reference_impl.rs, we merge the new CV with existing CVs from the - // stack before pushing it. We can do that because we know more input is - // coming, so we know none of the merges are root. - // - // This setting is different. We want to feed as much input as possible to - // compress_subtree_wide(), without setting aside anything for the - // chunk_state. If the user gives us 64 KiB, we want to parallelize over - // all 64 KiB at once as a single subtree, if at all possible. - // - // This leads to two problems: - // 1) This 64 KiB input might be the only call that ever gets made to - // update. In this case, the root node of the 64 KiB subtree would be - // the root node of the whole tree, and it would need to be ROOT - // finalized. We can't compress it until we know. - // 2) This 64 KiB input might complete a larger tree, whose root node is - // similarly going to be the the root of the whole tree. For example, - // maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't - // compress the node at the root of the 256 KiB subtree until we know - // how to finalize it. 
- // - // The second problem is solved with "lazy merging". That is, when we're - // about to add a CV to the stack, we don't merge it with anything first, - // as the reference impl does. Instead we do merges using the *previous* CV - // that was added, which is sitting on top of the stack, and we put the new - // CV (unmerged) on top of the stack afterwards. This guarantees that we - // never merge the root node until finalize(). - // - // Solving the first problem requires an additional tool, - // compress_subtree_to_parent_node(). That function always returns the top - // *two* chaining values of the subtree it's compressing. We then do lazy - // merging with each of them separately, so that the second CV will always - // remain unmerged. (That also helps us support extendable output when - // we're hashing an input all-at-once.) - fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) { - self.merge_cv_stack(chunk_counter); - self.cv_stack.push(*new_cv); - } - - /// Add input bytes to the hash state. You can call this any number of - /// times. - /// - /// This method is always single-threaded. For multi-threading support, see - /// `update_with_join` below. - /// - /// Note that the degree of SIMD parallelism that `update` can use is - /// limited by the size of this input buffer. The 8 KiB buffer currently - /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but - /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to - /// leverage all currently supported SIMD instruction sets. - /// - /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html - pub fn update(&mut self, input: &[u8]) -> &mut Self { - self.update_with_join::<SerialJoin>(input) - } - - /// Add input bytes to the hash state, as with `update`, but potentially - /// using multi-threading. See the example below, and the - /// [`join`](join/index.html) module for a more detailed explanation. 
- /// - /// To get any performance benefit from multi-threading, the input buffer - /// size needs to be very large. As a rule of thumb on x86_64, there is no - /// benefit to multi-threading inputs less than 128 KiB. Other platforms - /// have different thresholds, and in general you need to benchmark your - /// specific use case. Where possible, memory mapping an entire input file - /// is recommended, to take maximum advantage of multi-threading without - /// needing to tune a specific buffer size. Where memory mapping is not - /// possible, good multi-threading performance requires doing IO on a - /// background thread, to avoid sleeping all your worker threads while the - /// input buffer is (serially) refilled. This is quite complicated compared - /// to memory mapping. - /// - /// # Example - /// - /// ``` - /// // Hash a large input using multi-threading. Note that multi-threading - /// // comes with some overhead, and it can actually hurt performance for small - /// // inputs. The meaning of "small" varies, however, depending on the - /// // platform and the number of threads. (On x86_64, the cutoff tends to be - /// // around 128 KiB.) You should benchmark your own use case to see whether - /// // multi-threading helps. - /// # #[cfg(feature = "rayon")] - /// # { - /// # fn some_large_input() -> &'static [u8] { b"foo" } - /// let input: &[u8] = some_large_input(); - /// let mut hasher = blake3::Hasher::new(); - /// hasher.update_with_join::<blake3::join::RayonJoin>(input); - /// let hash = hasher.finalize(); - /// # } - /// ``` - pub fn update_with_join<J: Join>(&mut self, mut input: &[u8]) -> &mut Self { - // If we have some partial chunk bytes in the internal chunk_state, we - // need to finish that chunk first. 
- if self.chunk_state.len() > 0 { - let want = CHUNK_LEN - self.chunk_state.len(); - let take = cmp::min(want, input.len()); - self.chunk_state.update(&input[..take]); - input = &input[take..]; - if !input.is_empty() { - // We've filled the current chunk, and there's more input - // coming, so we know it's not the root and we can finalize it. - // Then we'll proceed to hashing whole chunks below. - debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); - let chunk_cv = self.chunk_state.output().chaining_value(); - self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); - self.chunk_state = ChunkState::new( - &self.key, - self.chunk_state.chunk_counter + 1, - self.chunk_state.flags, - self.chunk_state.platform, - ); - } else { - return self; - } - } - - // Now the chunk_state is clear, and we have more input. If there's - // more than a single chunk (so, definitely not the root chunk), hash - // the largest whole subtree we can, with the full benefits of SIMD and - // multi-threading parallelism. Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees - // along the right edge can be incomplete, and we don't know where - // the right edge is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until - // this point (if total is not 0). If the current incomplete subtree - // is only waiting for 1 more chunk, we can't hash a subtree of 4 - // chunks. We have to complete the current subtree first. - // Because we might need to break up the input to form powers of 2, or - // to evenly divide what we already have, this part runs in a loop. 
- while input.len() > CHUNK_LEN { - debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); - debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); - let mut subtree_len = largest_power_of_two_leq(input.len()); - let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; - // Shrink the subtree_len until it evenly divides the count so far. - // We know that subtree_len itself is a power of 2, so we can use a - // bitmasking trick instead of an actual remainder operation. (Note - // that if the caller consistently passes power-of-2 inputs of the - // same size, as is hopefully typical, this loop condition will - // always fail, and subtree_len will always be the full length of - // the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. - // For example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, - // we'll still get the right answer in the end, and we might get to - // use 2-way SIMD parallelism. The problem with this optimization, - // is that it gets us stuck always hashing 2 chunks. The total - // number of chunks will remain odd, and we'll never graduate to - // higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while (subtree_len - 1) as u64 & count_so_far != 0 { - subtree_len /= 2; - } - // The shrunken subtree_len might now be 1 chunk long. If so, hash - // that one chunk by itself. Otherwise, compress the subtree into a - // pair of CVs. 
- let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; - if subtree_len <= CHUNK_LEN { - debug_assert_eq!(subtree_len, CHUNK_LEN); - self.push_cv( - &ChunkState::new( - &self.key, - self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, - ) - .update(&input[..subtree_len]) - .output() - .chaining_value(), - self.chunk_state.chunk_counter, - ); - } else { - // This is the high-performance happy path, though getting here - // depends on the caller giving us a long enough input. - let cv_pair = compress_subtree_to_parent_node::<J>( - &input[..subtree_len], - &self.key, - self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, - ); - let left_cv = array_ref!(cv_pair, 0, 32); - let right_cv = array_ref!(cv_pair, 32, 32); - // Push the two CVs we received into the CV stack in order. Because - // the stack merges lazily, this guarantees we aren't merging the - // root. - self.push_cv(left_cv, self.chunk_state.chunk_counter); - self.push_cv( - right_cv, - self.chunk_state.chunk_counter + (subtree_chunks / 2), - ); - } - self.chunk_state.chunk_counter += subtree_chunks; - input = &input[subtree_len..]; - } - - // What remains is 1 chunk or less. Add it to the chunk state. - debug_assert!(input.len() <= CHUNK_LEN); - if !input.is_empty() { - self.chunk_state.update(input); - // Having added some input to the chunk_state, we know what's in - // the CV stack won't become the root node, and we can do an extra - // merge. This simplifies finalize(). - self.merge_cv_stack(self.chunk_state.chunk_counter); - } - - self - } - - fn final_output(&self) -> Output { - // If the current chunk is the only chunk, that makes it the root node - // also. Convert it directly into an Output. Otherwise, we need to - // merge subtrees below. 
- if self.cv_stack.is_empty() { - debug_assert_eq!(self.chunk_state.chunk_counter, 0); - return self.chunk_state.output(); - } - - // If there are any bytes in the ChunkState, finalize that chunk and - // merge its CV with everything in the CV stack. In that case, the work - // we did at the end of update() above guarantees that the stack - // doesn't contain any unmerged subtrees that need to be merged first. - // (This is important, because if there were two chunk hashes sitting - // on top of the stack, they would need to merge with each other, and - // merging a new chunk hash into them would be incorrect.) - // - // If there are no bytes in the ChunkState, we'll merge what's already - // in the stack. In this case it's fine if there are unmerged chunks on - // top, because we'll merge them with each other. Note that the case of - // the empty chunk is taken care of above. - let mut output: Output; - let mut num_cvs_remaining = self.cv_stack.len(); - if self.chunk_state.len() > 0 { - debug_assert_eq!( - self.cv_stack.len(), - self.chunk_state.chunk_counter.count_ones() as usize, - "cv stack does not need a merge" - ); - output = self.chunk_state.output(); - } else { - debug_assert!(self.cv_stack.len() >= 2); - output = parent_node_output( - &self.cv_stack[num_cvs_remaining - 2], - &self.cv_stack[num_cvs_remaining - 1], - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - num_cvs_remaining -= 2; - } - while num_cvs_remaining > 0 { - output = parent_node_output( - &self.cv_stack[num_cvs_remaining - 1], - &output.chaining_value(), - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - num_cvs_remaining -= 1; - } - output - } - - /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of - /// the input. - /// - /// This method is idempotent. Calling it twice will give the same result. - /// You can also add more input and finalize again. 
- pub fn finalize(&self) -> Hash { - self.final_output().root_hash() - } - - /// Finalize the hash state and return an [`OutputReader`], which can - /// supply any number of output bytes. - /// - /// This method is idempotent. Calling it twice will give the same result. - /// You can also add more input and finalize again. - /// - /// [`OutputReader`]: struct.OutputReader.html - pub fn finalize_xof(&self) -> OutputReader { - OutputReader::new(self.final_output()) - } -} - -// Don't derive(Debug), because the state may be secret. -impl fmt::Debug for Hasher { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Hasher") - .field("flags", &self.chunk_state.flags) - .field("platform", &self.chunk_state.platform) - .finish() - } -} - -impl Default for Hasher { - #[inline] - fn default() -> Self { - Self::new() - } -} - -#[cfg(feature = "std")] -impl std::io::Write for Hasher { - /// This is equivalent to [`update`](#method.update). - #[inline] - fn write(&mut self, input: &[u8]) -> std::io::Result<usize> { - self.update(input); - Ok(input.len()) - } - - #[inline] - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -/// An incremental reader for extended output, returned by -/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). -#[derive(Clone)] -pub struct OutputReader { - inner: Output, - position_within_block: u8, -} - -impl OutputReader { - fn new(inner: Output) -> Self { - Self { - inner, - position_within_block: 0, - } - } - - /// Fill a buffer with output bytes and advance the position of the - /// `OutputReader`. This is equivalent to [`Read::read`], except that it - /// doesn't return a `Result`. Both methods always fill the entire buffer. - /// - /// Note that `OutputReader` doesn't buffer output bytes internally, so - /// calling `fill` repeatedly with a short-length or odd-length slice will - /// end up performing the same compression multiple times. 
If you're - /// reading output in a loop, prefer a slice length that's a multiple of - /// 64. - /// - /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try - /// to extract more than that, for example by seeking near the end and - /// reading further, the behavior is unspecified. - /// - /// [`Read::read`]: #method.read - pub fn fill(&mut self, mut buf: &mut [u8]) { - while !buf.is_empty() { - let block: [u8; BLOCK_LEN] = self.inner.root_output_block(); - let output_bytes = &block[self.position_within_block as usize..]; - let take = cmp::min(buf.len(), output_bytes.len()); - buf[..take].copy_from_slice(&output_bytes[..take]); - buf = &mut buf[take..]; - self.position_within_block += take as u8; - if self.position_within_block == BLOCK_LEN as u8 { - self.inner.counter += 1; - self.position_within_block = 0; - } - } - } - - /// Return the current read position in the output stream. The position of - /// a new `OutputReader` starts at 0, and each call to [`fill`] or - /// [`Read::read`] moves the position forward by the number of bytes read. - /// - /// [`fill`]: #method.fill - /// [`Read::read`]: #method.read - pub fn position(&self) -> u64 { - self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64 - } - - /// Seek to a new read position in the output stream. This is equivalent to - /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't - /// return a `Result`. - /// - /// [`Seek::seek`]: #method.seek - /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html - pub fn set_position(&mut self, position: u64) { - self.position_within_block = (position % BLOCK_LEN as u64) as u8; - self.inner.counter = position / BLOCK_LEN as u64; - } -} - -// Don't derive(Debug), because the state may be secret. 
-impl fmt::Debug for OutputReader { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("OutputReader") - .field("position", &self.position()) - .finish() - } -} - -#[cfg(feature = "std")] -impl std::io::Read for OutputReader { - #[inline] - fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { - self.fill(buf); - Ok(buf.len()) - } -} - -#[cfg(feature = "std")] -impl std::io::Seek for OutputReader { - fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> { - let max_position = u64::max_value() as i128; - let target_position: i128 = match pos { - std::io::SeekFrom::Start(x) => x as i128, - std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128, - std::io::SeekFrom::End(_) => { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "seek from end not supported", - )); - } - }; - if target_position < 0 { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "seek before start", - )); - } - self.set_position(cmp::min(target_position, max_position) as u64); - Ok(self.position()) - } -} diff --git a/thirdparty/BLAKE3/src/platform.rs b/thirdparty/BLAKE3/src/platform.rs deleted file mode 100644 index 4bd67de7a..000000000 --- a/thirdparty/BLAKE3/src/platform.rs +++ /dev/null @@ -1,487 +0,0 @@ -use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; -use arrayref::{array_mut_ref, array_ref}; - -cfg_if::cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - cfg_if::cfg_if! { - if #[cfg(blake3_avx512_ffi)] { - pub const MAX_SIMD_DEGREE: usize = 16; - } else { - pub const MAX_SIMD_DEGREE: usize = 8; - } - } - } else if #[cfg(feature = "neon")] { - pub const MAX_SIMD_DEGREE: usize = 4; - } else { - pub const MAX_SIMD_DEGREE: usize = 1; - } -} - -// There are some places where we want a static size that's equal to the -// MAX_SIMD_DEGREE, but also at least 2. 
Constant contexts aren't currently -// allowed to use cmp::max, so we have to hardcode this additional constant -// value. Get rid of this once cmp::max is a const fn. -cfg_if::cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - cfg_if::cfg_if! { - if #[cfg(blake3_avx512_ffi)] { - pub const MAX_SIMD_DEGREE_OR_2: usize = 16; - } else { - pub const MAX_SIMD_DEGREE_OR_2: usize = 8; - } - } - } else if #[cfg(feature = "neon")] { - pub const MAX_SIMD_DEGREE_OR_2: usize = 4; - } else { - pub const MAX_SIMD_DEGREE_OR_2: usize = 2; - } -} - -#[derive(Clone, Copy, Debug)] -pub enum Platform { - Portable, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - SSE2, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - SSE41, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - AVX2, - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - AVX512, - #[cfg(feature = "neon")] - NEON, -} - -impl Platform { - #[allow(unreachable_code)] - pub fn detect() -> Self { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - #[cfg(blake3_avx512_ffi)] - { - if avx512_detected() { - return Platform::AVX512; - } - } - if avx2_detected() { - return Platform::AVX2; - } - if sse41_detected() { - return Platform::SSE41; - } - if sse2_detected() { - return Platform::SSE2; - } - } - // We don't use dynamic feature detection for NEON. If the "neon" - // feature is on, NEON is assumed to be supported. 
- #[cfg(feature = "neon")] - { - return Platform::NEON; - } - Platform::Portable - } - - pub fn simd_degree(&self) -> usize { - let degree = match self { - Platform::Portable => 1, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => 4, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 => 4, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX2 => 8, - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => 16, - #[cfg(feature = "neon")] - Platform::NEON => 4, - }; - debug_assert!(degree <= MAX_SIMD_DEGREE); - degree - } - - pub fn compress_in_place( - &self, - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, - ) { - match self { - Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - crate::sse2::compress_in_place(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 | Platform::AVX2 => unsafe { - crate::sse41::compress_in_place(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => unsafe { - crate::avx512::compress_in_place(cv, block, block_len, counter, flags) - }, - // No NEON compress_in_place() implementation yet. 
- #[cfg(feature = "neon")] - Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), - } - } - - pub fn compress_xof( - &self, - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, - ) -> [u8; 64] { - match self { - Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - crate::sse2::compress_xof(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 | Platform::AVX2 => unsafe { - crate::sse41::compress_xof(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => unsafe { - crate::avx512::compress_xof(cv, block, block_len, counter, flags) - }, - // No NEON compress_xof() implementation yet. - #[cfg(feature = "neon")] - Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), - } - } - - // IMPLEMENTATION NOTE - // =================== - // hash_many() applies two optimizations. The critically important - // optimization is the high-performance parallel SIMD hashing mode, - // described in detail in the spec. This more than doubles throughput per - // thread. Another optimization is keeping the state vectors transposed - // from block to block within a chunk. When state vectors are transposed - // after every block, there's a small but measurable performance loss. - // Compressing chunks with a dedicated loop avoids this. 
- - pub fn hash_many<A: arrayvec::Array<Item = u8>>( - &self, - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], - ) { - match self { - Platform::Portable => portable::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - crate::sse2::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 => unsafe { - crate::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX2 => unsafe { - crate::avx2::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => unsafe { - crate::avx512::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Assumed to be safe if the "neon" feature is on. - #[cfg(feature = "neon")] - Platform::NEON => unsafe { - crate::neon::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - } - } - - // Explicit platform constructors, for benchmarks. 
- - pub fn portable() -> Self { - Self::Portable - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn sse2() -> Option<Self> { - if sse2_detected() { - Some(Self::SSE2) - } else { - None - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn sse41() -> Option<Self> { - if sse41_detected() { - Some(Self::SSE41) - } else { - None - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn avx2() -> Option<Self> { - if avx2_detected() { - Some(Self::AVX2) - } else { - None - } - } - - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn avx512() -> Option<Self> { - if avx512_detected() { - Some(Self::AVX512) - } else { - None - } - } - - #[cfg(feature = "neon")] - pub fn neon() -> Option<Self> { - // Assumed to be safe if the "neon" feature is on. - Some(Self::NEON) - } -} - -// Note that AVX-512 is divided into multiple featuresets, and we use two of -// them, F and VL. -#[cfg(blake3_avx512_ffi)] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn avx512_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_avx512") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] - { - return true; - } - // Dynamic check, if std is enabled. - #[cfg(feature = "std")] - { - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - return true; - } - } - false -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn avx2_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_avx2") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "avx2")] - { - return true; - } - // Dynamic check, if std is enabled. 
- #[cfg(feature = "std")] - { - if is_x86_feature_detected!("avx2") { - return true; - } - } - false -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn sse41_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_sse41") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "sse4.1")] - { - return true; - } - // Dynamic check, if std is enabled. - #[cfg(feature = "std")] - { - if is_x86_feature_detected!("sse4.1") { - return true; - } - } - false -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -#[allow(unreachable_code)] -pub fn sse2_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_sse2") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "sse2")] - { - return true; - } - // Dynamic check, if std is enabled. - #[cfg(feature = "std")] - { - if is_x86_feature_detected!("sse2") { - return true; - } - } - false -} - -#[inline(always)] -pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { - let mut out = [0; 8]; - out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); - out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); - out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); - out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); - out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); - out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); - out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); - out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); - out -} - -#[inline(always)] -pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { - let mut out = [0; 16]; - out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); - out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); - out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); - out[3] = 
u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); - out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); - out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); - out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); - out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); - out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); - out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); - out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); - out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); - out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); - out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); - out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); - out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); - out -} - -#[inline(always)] -pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { - let mut out = [0; 32]; - *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); - *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); - *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); - *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); - *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); - *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); - *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); - *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); - out -} - -#[inline(always)] -pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { - let mut out = [0; 64]; - *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); - *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); - *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); - *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); - *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); - *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); - *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); - *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); - *array_mut_ref!(out, 8 
* 4, 4) = words[8].to_le_bytes(); - *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); - *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); - *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); - *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); - *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); - *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); - *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); - out -} diff --git a/thirdparty/BLAKE3/src/portable.rs b/thirdparty/BLAKE3/src/portable.rs deleted file mode 100644 index 0a569cec7..000000000 --- a/thirdparty/BLAKE3/src/portable.rs +++ /dev/null @@ -1,198 +0,0 @@ -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref}; - -#[inline(always)] -fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { - state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); - state[d] = (state[d] ^ state[a]).rotate_right(16); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(12); - state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); - state[d] = (state[d] ^ state[a]).rotate_right(8); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(7); -} - -#[inline(always)] -fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { - // Select the message schedule based on the round. - let schedule = MSG_SCHEDULE[round]; - - // Mix the columns. - g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the diagonals. 
- g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -#[inline(always)] -fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u32; 16] { - let block_words = crate::platform::words_from_le_bytes_64(block); - - let mut state = [ - cv[0], - cv[1], - cv[2], - cv[3], - cv[4], - cv[5], - cv[6], - cv[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ]; - - round(&mut state, &block_words, 0); - round(&mut state, &block_words, 1); - round(&mut state, &block_words, 2); - round(&mut state, &block_words, 3); - round(&mut state, &block_words, 4); - round(&mut state, &block_words, 5); - round(&mut state, &block_words, 6); - - state -} - -pub fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let state = compress_pre(cv, block, block_len, counter, flags); - - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -pub fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut state = compress_pre(cv, block, block_len, counter, flags); - state[0] ^= state[8]; - state[1] ^= state[9]; - state[2] ^= state[10]; - state[3] ^= state[11]; - state[4] ^= state[12]; - state[5] ^= state[13]; - state[6] ^= state[14]; - state[7] ^= state[15]; - state[8] ^= cv[0]; - state[9] ^= cv[1]; - state[10] ^= cv[2]; - state[11] ^= cv[3]; - state[12] ^= cv[4]; - state[13] ^= cv[5]; - state[14] ^= cv[6]; - state[15] ^= cv[7]; - 
crate::platform::le_bytes_from_words_64(&state) -} - -pub fn hash1<A: arrayvec::Array<Item = u8>>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = crate::platform::le_bytes_from_words_32(&cv); -} - -pub fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -pub mod test { - use super::*; - - // This is basically testing the portable implementation against itself, - // but it also checks that compress_in_place and compress_xof are - // consistent. And there are tests against the reference implementation and - // against hardcoded test vectors elsewhere. - #[test] - fn test_compress() { - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - // Ditto. 
- #[test] - fn test_hash_many() { - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/rust_avx2.rs b/thirdparty/BLAKE3/src/rust_avx2.rs deleted file mode 100644 index 6ab773ad4..000000000 --- a/thirdparty/BLAKE3/src/rust_avx2.rs +++ /dev/null @@ -1,474 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, -}; -use arrayref::{array_mut_ref, mut_array_refs}; - -pub const DEGREE: usize = 8; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m256i { - // This is an unaligned load, so the pointer cast is allowed. - _mm256_loadu_si256(src as *const __m256i) -} - -#[inline(always)] -unsafe fn storeu(src: __m256i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm256_storeu_si256(dest as *mut __m256i, src) -} - -#[inline(always)] -unsafe fn add(a: __m256i, b: __m256i) -> __m256i { - _mm256_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { - _mm256_xor_si256(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m256i { - _mm256_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { - _mm256_setr_epi32( - a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, - ) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. 
- -#[inline(always)] -unsafe fn rot16(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] 
= add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn interleave128(a: 
__m256i, b: __m256i) -> (__m256i, __m256i) { - ( - _mm256_permute2x128_si256(a, b, 0x20), - _mm256_permute2x128_si256(a, b, 0x31), - ) -} - -// There are several ways to do a transposition. We could do it naively, with 8 separate -// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy -// the vecs into contiguous storage and then use gather instructions. This third approach is to use -// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the -// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the -// https://github.com/oconnor663/bao_experiments repo. -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. - let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. - let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); - let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); - let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); - let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); - - vecs[0] = abcdefgh_0; - vecs[1] = abcdefgh_1; - vecs[2] = abcdefgh_2; - vecs[3] = abcdefgh_3; - vecs[4] = abcdefgh_4; - vecs[5] = abcdefgh_5; - vecs[6] = abcdefgh_6; - vecs[7] = abcdefgh_7; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set8( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - 
counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - counter_low(counter + (mask & 4)), - counter_low(counter + (mask & 5)), - counter_low(counter + (mask & 6)), - counter_low(counter + (mask & 7)), - ), - set8( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - counter_high(counter + (mask & 4)), - counter_high(counter + (mask & 5)), - counter_high(counter + (mask & 6)), - counter_high(counter + (mask & 7)), - ), - ) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash8( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. 
- let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&mut h_vecs); - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. 
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash8( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - crate::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ); -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::avx2_detected() { - return; - } - - #[target_feature(enable = "avx2")] - unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. 
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_hash_many() { - if !crate::platform::avx2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/rust_sse2.rs b/thirdparty/BLAKE3/src/rust_sse2.rs deleted file mode 100644 index 15b52ee5d..000000000 --- a/thirdparty/BLAKE3/src/rust_sse2.rs +++ /dev/null @@ -1,775 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref, mut_array_refs}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. 
- -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - let mut mask = _mm_set1_epi16(imm8 as i16); - mask = _mm_and_si128(mask, bits); - mask = _mm_cmpeq_epi16(mask, bits); - _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = 
shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse2")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse2")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, mut row3] = - compress_pre(cv, block, block_len, 
counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], 
m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. 
- let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - transpose_vecs(squares.2); - transpose_vecs(squares.3); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if 
increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse2")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. 
- let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. 
- storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse2")] -unsafe fn hash1<A: arrayvec::Array<Item = u8>>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse2")] -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. 
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::sse2_detected() { - return; - } - - #[target_feature(enable = "sse2")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. 
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/rust_sse41.rs b/thirdparty/BLAKE3/src/rust_sse41.rs deleted file mode 100644 index d5cf0f4a9..000000000 --- a/thirdparty/BLAKE3/src/rust_sse41.rs +++ /dev/null @@ -1,766 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref, mut_array_refs}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. 
-// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. 
See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = 
_mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, 
row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, 
mut row3] = - compress_pre(cv, block, block_len, counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = 
rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. 
- let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - transpose_vecs(squares.2); - transpose_vecs(squares.3); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if 
increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. 
- let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. 
- storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse4.1")] -unsafe fn hash1<A: arrayvec::Array<Item = u8>>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. 
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::sse41_detected() { - return; - } - - #[target_feature(enable = "sse4.1")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. 
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/test.rs b/thirdparty/BLAKE3/src/test.rs deleted file mode 100644 index eefb1a354..000000000 --- a/thirdparty/BLAKE3/src/test.rs +++ /dev/null @@ -1,569 +0,0 @@ -use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; -use arrayref::array_ref; -use arrayvec::ArrayVec; -use core::sync::atomic::{AtomicUsize, Ordering}; -use core::usize; -use rand::prelude::*; - -// Interesting input lengths to run tests on. -pub const TEST_CASES: &[usize] = &[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - BLOCK_LEN - 1, - BLOCK_LEN, - BLOCK_LEN + 1, - 2 * BLOCK_LEN - 1, - 2 * BLOCK_LEN, - 2 * BLOCK_LEN + 1, - CHUNK_LEN - 1, - CHUNK_LEN, - CHUNK_LEN + 1, - 2 * CHUNK_LEN, - 2 * CHUNK_LEN + 1, - 3 * CHUNK_LEN, - 3 * CHUNK_LEN + 1, - 4 * CHUNK_LEN, - 4 * CHUNK_LEN + 1, - 5 * CHUNK_LEN, - 5 * CHUNK_LEN + 1, - 6 * CHUNK_LEN, - 6 * CHUNK_LEN + 1, - 7 * CHUNK_LEN, - 7 * CHUNK_LEN + 1, - 8 * CHUNK_LEN, - 8 * CHUNK_LEN + 1, - 16 * CHUNK_LEN, // AVX512's bandwidth - 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 - 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks -]; - -pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; - -// There's a test to make sure these two are equal below. -pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; -pub const TEST_KEY_WORDS: CVWords = [ - 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, -]; - -// Paint the input with a repeating byte pattern. We use a cycle length of 251, -// because that's the largets prime number less than 256. 
This makes it -// unlikely to swapping any two adjacent input blocks or chunks will give the -// same answer. -pub fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -type CompressInPlaceFn = - unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); - -type CompressXofFn = unsafe fn( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64]; - -// A shared helper function for platform-specific tests. -pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { - let initial_state = TEST_KEY_WORDS; - let block_len: u8 = 61; - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len as usize]); - // Use a counter with set bits in both 32-bit words. - let counter = (5u64 << 32) + 6; - let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; - - let portable_out = - crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); - - let mut test_state = initial_state; - unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; - let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); - let test_xof = - unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; - - assert_eq!(&portable_out[..32], &test_state_bytes[..]); - assert_eq!(&portable_out[..], &test_xof[..]); -} - -type HashManyFn<A> = unsafe fn( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -); - -// A shared helper function for platform-specific tests. 
-pub fn test_hash_many_fn( - hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, - hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, -) { - // 31 (16 + 8 + 4 + 2 + 1) inputs - const NUM_INPUTS: usize = 31; - let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; - crate::test::paint_test_input(&mut input_buf); - // A counter just prior to u32::MAX. - let counter = (1u64 << 32) - 1; - - // First hash chunks. - let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new(); - for i in 0..NUM_INPUTS { - chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); - } - let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - crate::portable::hash_many( - &chunks, - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - crate::KEYED_HASH, - crate::CHUNK_START, - crate::CHUNK_END, - &mut portable_chunks_out, - ); - - let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_chunks_fn( - &chunks[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - crate::KEYED_HASH, - crate::CHUNK_START, - crate::CHUNK_END, - &mut test_chunks_out, - ); - } - for n in 0..NUM_INPUTS { - #[cfg(feature = "std")] - dbg!(n); - assert_eq!( - &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], - &test_chunks_out[n * OUT_LEN..][..OUT_LEN] - ); - } - - // Then hash parents. 
- let mut parents = ArrayVec::<[&[u8; 2 * OUT_LEN]; NUM_INPUTS]>::new(); - for i in 0..NUM_INPUTS { - parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); - } - let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; - crate::portable::hash_many( - &parents, - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - crate::KEYED_HASH | crate::PARENT, - 0, - 0, - &mut portable_parents_out, - ); - - let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_parents_fn( - &parents[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - crate::KEYED_HASH | crate::PARENT, - 0, - 0, - &mut test_parents_out, - ); - } - for n in 0..NUM_INPUTS { - #[cfg(feature = "std")] - dbg!(n); - assert_eq!( - &portable_parents_out[n * OUT_LEN..][..OUT_LEN], - &test_parents_out[n * OUT_LEN..][..OUT_LEN] - ); - } -} - -#[test] -fn test_key_bytes_equal_key_words() { - assert_eq!( - TEST_KEY_WORDS, - crate::platform::words_from_le_bytes_32(&TEST_KEY), - ); -} - -#[test] -fn test_reference_impl_size() { - // Because the Rust compiler optimizes struct layout, it's possible that - // some future version of the compiler will produce a different size. If - // that happens, we can either disable this test, or test for multiple - // expected values. For now, the purpose of this test is to make sure we - // notice if that happens. - assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>()); -} - -#[test] -fn test_counter_words() { - let counter: u64 = (1 << 32) + 2; - assert_eq!(crate::counter_low(counter), 2); - assert_eq!(crate::counter_high(counter), 1); -} - -#[test] -fn test_largest_power_of_two_leq() { - let input_output = &[ - // The zero case is nonsensical, but it does work. 
- (0, 1), - (1, 1), - (2, 2), - (3, 2), - (4, 4), - (5, 4), - (6, 4), - (7, 4), - (8, 8), - // the largest possible usize - (usize::MAX, (usize::MAX >> 1) + 1), - ]; - for &(input, output) in input_output { - assert_eq!( - output, - crate::largest_power_of_two_leq(input), - "wrong output for n={}", - input - ); - } -} - -#[test] -fn test_left_len() { - let input_output = &[ - (CHUNK_LEN + 1, CHUNK_LEN), - (2 * CHUNK_LEN - 1, CHUNK_LEN), - (2 * CHUNK_LEN, CHUNK_LEN), - (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN, 2 * CHUNK_LEN), - (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), - ]; - for &(input, output) in input_output { - assert_eq!(crate::left_len(input), output); - } -} - -#[test] -fn test_compare_reference_impl() { - const OUT: usize = 303; // more than 64, not a multiple of 4 - let mut input_buf = [0; TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - for &case in TEST_CASES { - let input = &input_buf[..case]; - #[cfg(feature = "std")] - dbg!(case); - - // regular - { - let mut reference_hasher = reference_impl::Hasher::new(); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let test_out = crate::hash(input); - assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); - // incremental - let mut hasher = crate::Hasher::new(); - hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), test_out); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended[..], expected_out[..]); - } - - // keyed - { - let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let test_out = crate::keyed_hash(&TEST_KEY, input); - assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); - // 
incremental - let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); - hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), test_out); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended[..], expected_out[..]); - } - - // derive_key - { - let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; - let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let mut test_out = [0; OUT]; - crate::derive_key(context, input, &mut test_out); - assert_eq!(test_out[..], expected_out[..]); - // incremental - let mut hasher = crate::Hasher::new_derive_key(context); - hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended[..], expected_out[..]); - } - } -} - -fn reference_hash(input: &[u8]) -> crate::Hash { - let mut hasher = reference_impl::Hasher::new(); - hasher.update(input); - let mut bytes = [0; 32]; - hasher.finalize(&mut bytes); - bytes.into() -} - -#[test] -fn test_compare_update_multiple() { - // Don't use all the long test cases here, since that's unnecessarily slow - // in debug mode. 
- let mut short_test_cases = TEST_CASES; - while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { - short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; - } - assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); - - let mut input_buf = [0; 2 * TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - - for &first_update in short_test_cases { - #[cfg(feature = "std")] - dbg!(first_update); - let first_input = &input_buf[..first_update]; - let mut test_hasher = crate::Hasher::new(); - test_hasher.update(first_input); - - for &second_update in short_test_cases { - #[cfg(feature = "std")] - dbg!(second_update); - let second_input = &input_buf[first_update..][..second_update]; - let total_input = &input_buf[..first_update + second_update]; - - // Clone the hasher with first_update bytes already written, so - // that the next iteration can reuse it. - let mut test_hasher = test_hasher.clone(); - test_hasher.update(second_input); - let expected = reference_hash(total_input); - assert_eq!(expected, test_hasher.finalize()); - } - } -} - -#[test] -fn test_fuzz_hasher() { - const INPUT_MAX: usize = 4 * CHUNK_LEN; - let mut input_buf = [0; 3 * INPUT_MAX]; - paint_test_input(&mut input_buf); - - // Don't do too many iterations in debug mode, to keep the tests under a - // second or so. CI should run tests in release mode also. Provide an - // environment variable for specifying a larger number of fuzz iterations. - let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; - - // Use a fixed RNG seed for reproducibility. - let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); - for _num_test in 0..num_tests { - #[cfg(feature = "std")] - dbg!(_num_test); - let mut hasher = crate::Hasher::new(); - let mut total_input = 0; - // For each test, write 3 inputs of random length. 
- for _ in 0..3 { - let input_len = rng.gen_range(0, INPUT_MAX + 1); - #[cfg(feature = "std")] - dbg!(input_len); - let input = &input_buf[total_input..][..input_len]; - hasher.update(input); - total_input += input_len; - } - let expected = reference_hash(&input_buf[..total_input]); - assert_eq!(expected, hasher.finalize()); - } -} - -#[test] -fn test_xof_seek() { - let mut out = [0; 533]; - let mut hasher = crate::Hasher::new(); - hasher.update(b"foo"); - hasher.finalize_xof().fill(&mut out); - assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); - - let mut reader = hasher.finalize_xof(); - reader.set_position(303); - let mut out2 = [0; 102]; - reader.fill(&mut out2); - assert_eq!(&out[303..][..102], &out2[..]); - - #[cfg(feature = "std")] - { - use std::io::prelude::*; - let mut reader = hasher.finalize_xof(); - reader.seek(std::io::SeekFrom::Start(303)).unwrap(); - let mut out3 = Vec::new(); - reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); - assert_eq!(&out[303..][..102], &out3[..]); - - assert_eq!( - reader.seek(std::io::SeekFrom::Current(0)).unwrap(), - 303 + 102 - ); - reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); - assert_eq!( - reader.seek(std::io::SeekFrom::Current(0)).unwrap(), - 303 + 102 - 5 - ); - let mut out4 = [0; 17]; - assert_eq!(reader.read(&mut out4).unwrap(), 17); - assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); - assert_eq!( - reader.seek(std::io::SeekFrom::Current(0)).unwrap(), - 303 + 102 - 5 + 17 - ); - assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); - assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); - } -} - -#[test] -fn test_msg_schdule_permutation() { - let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; - - let mut generated = [[0; 16]; 7]; - generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - - for round in 1..7 { - for i in 0..16 { - generated[round][i] = generated[round - 1][permutation[i]]; - } - } - - assert_eq!(generated, 
crate::MSG_SCHEDULE); -} - -#[test] -fn test_reset() { - let mut hasher = crate::Hasher::new(); - hasher.update(&[42; 3 * CHUNK_LEN + 7]); - hasher.reset(); - hasher.update(&[42; CHUNK_LEN + 3]); - assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); - - let key = &[99; crate::KEY_LEN]; - let mut keyed_hasher = crate::Hasher::new_keyed(key); - keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); - keyed_hasher.reset(); - keyed_hasher.update(&[42; CHUNK_LEN + 3]); - assert_eq!( - keyed_hasher.finalize(), - crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), - ); - - let context = "BLAKE3 2020-02-12 10:20:58 reset test"; - let mut kdf = crate::Hasher::new_derive_key(context); - kdf.update(&[42; 3 * CHUNK_LEN + 7]); - kdf.reset(); - kdf.update(&[42; CHUNK_LEN + 3]); - let mut expected = [0; crate::OUT_LEN]; - crate::derive_key(context, &[42; CHUNK_LEN + 3], &mut expected); - assert_eq!(kdf.finalize(), expected); -} - -#[test] -#[cfg(feature = "rayon")] -fn test_update_with_rayon_join() { - let mut input = [0; TEST_CASES_MAX]; - paint_test_input(&mut input); - let rayon_hash = crate::Hasher::new() - .update_with_join::<crate::join::RayonJoin>(&input) - .finalize(); - assert_eq!(crate::hash(&input), rayon_hash); -} - -// Test that the length values given to Join::join are what they're supposed to -// be. -#[test] -fn test_join_lengths() { - // Use static atomics to let us safely get a couple of values in and out of - // CustomJoin. This avoids depending on std, though it assumes that this - // thread will only run once in the lifetime of the runner process. - static SINGLE_THREAD_LEN: AtomicUsize = AtomicUsize::new(0); - static CUSTOM_JOIN_CALLS: AtomicUsize = AtomicUsize::new(0); - - // Use an input that's exactly (simd_degree * CHUNK_LEN) + 1. That should - // guarantee that compress_subtree_wide does exactly one split, with the - // last byte on the right side. 
Note that it we used - // Hasher::update_with_join, we would end up buffering that last byte, - // rather than splitting and joining it. - let single_thread_len = crate::platform::Platform::detect().simd_degree() * CHUNK_LEN; - SINGLE_THREAD_LEN.store(single_thread_len, Ordering::SeqCst); - let mut input_buf = [0; 2 * crate::platform::MAX_SIMD_DEGREE * CHUNK_LEN]; - paint_test_input(&mut input_buf); - let input = &input_buf[..single_thread_len + 1]; - - enum CustomJoin {} - - impl crate::join::Join for CustomJoin { - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send, - { - let prev_calls = CUSTOM_JOIN_CALLS.fetch_add(1, Ordering::SeqCst); - assert_eq!(prev_calls, 0); - assert_eq!(len_a, SINGLE_THREAD_LEN.load(Ordering::SeqCst)); - assert_eq!(len_b, 1); - (oper_a(), oper_b()) - } - } - - let mut out_buf = [0; crate::platform::MAX_SIMD_DEGREE_OR_2 * CHUNK_LEN]; - crate::compress_subtree_wide::<CustomJoin>( - input, - crate::IV, - 0, - 0, - crate::platform::Platform::detect(), - &mut out_buf, - ); - assert_eq!(CUSTOM_JOIN_CALLS.load(Ordering::SeqCst), 1); -} diff --git a/thirdparty/BLAKE3/src/traits.rs b/thirdparty/BLAKE3/src/traits.rs deleted file mode 100644 index 9704e0106..000000000 --- a/thirdparty/BLAKE3/src/traits.rs +++ /dev/null @@ -1,184 +0,0 @@ -//! Implementations of commonly used traits like -//! [`digest::Digest`](https://crates.io/crates/digest) and -//! [`crypto_mac::Mac`](https://crates.io/crates/crypto-mac). 
- -pub use crypto_mac; -pub use digest; - -use crate::{Hasher, OutputReader}; -use digest::generic_array::{ - typenum::{U32, U64}, - GenericArray, -}; - -impl digest::BlockInput for Hasher { - type BlockSize = U64; -} - -impl digest::Update for Hasher { - #[inline] - fn update(&mut self, data: impl AsRef<[u8]>) { - self.update(data.as_ref()); - } -} - -impl digest::Reset for Hasher { - #[inline] - fn reset(&mut self) { - self.reset(); // the inherent method - } -} - -impl digest::FixedOutput for Hasher { - type OutputSize = U32; - - #[inline] - fn finalize_into(self, out: &mut GenericArray<u8, Self::OutputSize>) { - out.copy_from_slice(self.finalize().as_bytes()); - } - - #[inline] - fn finalize_into_reset(&mut self, out: &mut GenericArray<u8, Self::OutputSize>) { - out.copy_from_slice(self.finalize().as_bytes()); - self.reset(); - } -} - -impl digest::ExtendableOutput for Hasher { - type Reader = OutputReader; - - #[inline] - fn finalize_xof(self) -> Self::Reader { - Hasher::finalize_xof(&self) - } - - #[inline] - fn finalize_xof_reset(&mut self) -> Self::Reader { - let reader = Hasher::finalize_xof(self); - self.reset(); - reader - } -} - -impl digest::XofReader for OutputReader { - #[inline] - fn read(&mut self, buffer: &mut [u8]) { - self.fill(buffer); - } -} - -impl crypto_mac::NewMac for Hasher { - type KeySize = U32; - - #[inline] - fn new(key: &crypto_mac::Key<Self>) -> Self { - let key_bytes: [u8; 32] = (*key).into(); - Hasher::new_keyed(&key_bytes) - } -} - -impl crypto_mac::Mac for Hasher { - type OutputSize = U32; - - #[inline] - fn update(&mut self, data: &[u8]) { - self.update(data); - } - - #[inline] - fn reset(&mut self) { - self.reset(); - } - - #[inline] - fn finalize(self) -> crypto_mac::Output<Self> { - crypto_mac::Output::new(digest::Digest::finalize(self)) - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_digest_traits() { - // Inherent methods. 
- let mut hasher1 = crate::Hasher::new(); - hasher1.update(b"foo"); - hasher1.update(b"bar"); - hasher1.update(b"baz"); - let out1 = hasher1.finalize(); - let mut xof1 = [0; 301]; - hasher1.finalize_xof().fill(&mut xof1); - assert_eq!(out1.as_bytes(), &xof1[..32]); - - // Trait implementations. - let mut hasher2: crate::Hasher = digest::Digest::new(); - digest::Digest::update(&mut hasher2, b"xxx"); - digest::Digest::reset(&mut hasher2); - digest::Digest::update(&mut hasher2, b"foo"); - digest::Digest::update(&mut hasher2, b"bar"); - digest::Digest::update(&mut hasher2, b"baz"); - let out2 = digest::Digest::finalize(hasher2.clone()); - let mut xof2 = [0; 301]; - digest::XofReader::read( - &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), - &mut xof2, - ); - assert_eq!(out1.as_bytes(), &out2[..]); - assert_eq!(xof1[..], xof2[..]); - - // Again with the resetting variants. - let mut hasher3: crate::Hasher = digest::Digest::new(); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut out3 = [0; 32]; - digest::FixedOutput::finalize_into_reset( - &mut hasher3, - GenericArray::from_mut_slice(&mut out3), - ); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut out4 = [0; 32]; - digest::FixedOutput::finalize_into_reset( - &mut hasher3, - GenericArray::from_mut_slice(&mut out4), - ); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut xof3 = [0; 301]; - digest::XofReader::read( - &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3), - &mut xof3, - ); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut xof4 = [0; 301]; - digest::XofReader::read( - &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3), - &mut xof4, - ); - assert_eq!(out1.as_bytes(), &out3[..]); - assert_eq!(out1.as_bytes(), &out4[..]); - assert_eq!(xof1[..], xof3[..]); - assert_eq!(xof1[..], xof4[..]); - } - - #[test] - fn test_mac_trait() { - // Inherent methods. 
- let key = b"some super secret key bytes fooo"; - let mut hasher1 = crate::Hasher::new_keyed(key); - hasher1.update(b"foo"); - hasher1.update(b"bar"); - hasher1.update(b"baz"); - let out1 = hasher1.finalize(); - - // Trait implementation. - let generic_key = (*key).into(); - let mut hasher2: crate::Hasher = crypto_mac::NewMac::new(&generic_key); - crypto_mac::Mac::update(&mut hasher2, b"xxx"); - crypto_mac::Mac::reset(&mut hasher2); - crypto_mac::Mac::update(&mut hasher2, b"foo"); - crypto_mac::Mac::update(&mut hasher2, b"bar"); - crypto_mac::Mac::update(&mut hasher2, b"baz"); - let out2 = crypto_mac::Mac::finalize(hasher2); - assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice()); - } -} diff --git a/thirdparty/BLAKE3/test_vectors/Cargo.toml b/thirdparty/BLAKE3/test_vectors/Cargo.toml deleted file mode 100644 index cd74a9df0..000000000 --- a/thirdparty/BLAKE3/test_vectors/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "test_vectors" -version = "0.0.0" -edition = "2018" - -[features] -neon = ["blake3/neon"] -prefer_intrinsics = ["blake3/prefer_intrinsics"] -pure = ["blake3/pure"] - -[dependencies] -# If you ever change these path dependencies, you'll probably need to update -# cross_test.sh, or CI will break. I'm sorry >.< -blake3 = { path = "../" } -hex = "0.4.0" -reference_impl = { path = "../reference_impl" } -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" diff --git a/thirdparty/BLAKE3/test_vectors/cross_test.sh b/thirdparty/BLAKE3/test_vectors/cross_test.sh deleted file mode 100644 index c4d280c9d..000000000 --- a/thirdparty/BLAKE3/test_vectors/cross_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#! /usr/bin/env bash - -# This hacky script works around the fact that `cross test` does not support -# path dependencies. (It uses a docker shared folder to let the guest access -# project files, so parent directories aren't available.) 
Solve this problem by -# copying the entire project to a temp dir and rearranging paths to put -# "blake3" and "reference_impl" underneath "test_vectors", so that everything -# is accessible. Hopefully this will just run on CI forever and no one will -# ever read this and discover my deep shame. - -set -e -u -o pipefail - -project_root="$(realpath "$(dirname "$BASH_SOURCE")/..")" -tmpdir="$(mktemp -d)" -echo "Running cross tests in $tmpdir" -cd "$tmpdir" -git clone "$project_root" blake3 -mv blake3/test_vectors . -mv blake3/reference_impl test_vectors -mv blake3 test_vectors -cd test_vectors -sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml -sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml - -cross test "$@" diff --git a/thirdparty/BLAKE3/test_vectors/src/lib.rs b/thirdparty/BLAKE3/test_vectors/src/lib.rs deleted file mode 100644 index 04460f668..000000000 --- a/thirdparty/BLAKE3/test_vectors/src/lib.rs +++ /dev/null @@ -1,349 +0,0 @@ -use blake3::{BLOCK_LEN, CHUNK_LEN}; -use serde::{Deserialize, Serialize}; - -// A non-multiple of 4 is important, since one possible bug is to fail to emit -// partial words. 
-pub const OUTPUT_LEN: usize = 2 * blake3::BLOCK_LEN + 3; - -pub const TEST_CASES: &[usize] = &[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - BLOCK_LEN - 1, - BLOCK_LEN, - BLOCK_LEN + 1, - 2 * BLOCK_LEN - 1, - 2 * BLOCK_LEN, - 2 * BLOCK_LEN + 1, - CHUNK_LEN - 1, - CHUNK_LEN, - CHUNK_LEN + 1, - 2 * CHUNK_LEN, - 2 * CHUNK_LEN + 1, - 3 * CHUNK_LEN, - 3 * CHUNK_LEN + 1, - 4 * CHUNK_LEN, - 4 * CHUNK_LEN + 1, - 5 * CHUNK_LEN, - 5 * CHUNK_LEN + 1, - 6 * CHUNK_LEN, - 6 * CHUNK_LEN + 1, - 7 * CHUNK_LEN, - 7 * CHUNK_LEN + 1, - 8 * CHUNK_LEN, - 8 * CHUNK_LEN + 1, - 16 * CHUNK_LEN, // AVX512's bandwidth - 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 - 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks -]; - -pub const TEST_KEY: &[u8; blake3::KEY_LEN] = b"whats the Elvish word for friend"; -pub const TEST_CONTEXT: &str = "BLAKE3 2019-12-27 16:29:52 test vectors context"; - -const COMMENT: &str = r#" -Each test is an input length and three outputs, one for each of the hash, -keyed_hash, and derive_key modes. The input in each case is filled with a -repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. -The key used with keyed_hash is the 32-byte ASCII string "whats the Elvish word -for friend", also given in the `key` field below. The context string used with -derive_key is the ASCII string "BLAKE3 2019-12-27 16:29:52 test vectors -context", also given in the `context_string` field below. Outputs are encoded -as hexadecimal. Each case is an extended output, and implementations should -also check that the first 32 bytes match their default-length output. -"#; - -// Paint the input with a repeating byte pattern. We use a cycle length of 251, -// because that's the largets prime number less than 256. This makes it -// unlikely to swapping any two adjacent input blocks or chunks will give the -// same answer. 
-pub fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct Cases { - pub _comment: String, - pub key: String, - pub context_string: String, - pub cases: Vec<Case>, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct Case { - pub input_len: usize, - pub hash: String, - pub keyed_hash: String, - pub derive_key: String, -} - -pub fn generate_json() -> String { - let mut cases = Vec::new(); - for &input_len in TEST_CASES { - let mut input = vec![0; input_len]; - paint_test_input(&mut input); - - let mut hash_out = [0; OUTPUT_LEN]; - blake3::Hasher::new() - .update(&input) - .finalize_xof() - .fill(&mut hash_out); - - let mut keyed_hash_out = [0; OUTPUT_LEN]; - blake3::Hasher::new_keyed(TEST_KEY) - .update(&input) - .finalize_xof() - .fill(&mut keyed_hash_out); - - let mut derive_key_out = [0; OUTPUT_LEN]; - blake3::Hasher::new_derive_key(TEST_CONTEXT) - .update(&input) - .finalize_xof() - .fill(&mut derive_key_out); - - cases.push(Case { - input_len, - hash: hex::encode(&hash_out[..]), - keyed_hash: hex::encode(&keyed_hash_out[..]), - derive_key: hex::encode(&derive_key_out[..]), - }); - } - - let mut json = serde_json::to_string_pretty(&Cases { - _comment: COMMENT.trim().replace("\n", " "), - key: std::str::from_utf8(TEST_KEY).unwrap().to_string(), - context_string: TEST_CONTEXT.to_string(), - cases, - }) - .unwrap(); - - // Add a trailing newline. 
- json.push('\n'); - json -} - -pub fn read_test_vectors_file() -> String { - let test_vectors_file_path = "./test_vectors.json"; - std::fs::read_to_string(test_vectors_file_path).expect("failed to read test_vectors.json") -} - -pub fn parse_test_cases() -> Cases { - let json = read_test_vectors_file(); - serde_json::from_str(&json).expect("failed to parse test_vectors.json") -} - -#[cfg(test)] -mod tests { - use super::*; - use std::convert::TryInto; - - fn test_reference_impl_all_at_once( - key: &[u8; blake3::KEY_LEN], - input: &[u8], - expected_hash: &[u8], - expected_keyed_hash: &[u8], - expected_derive_key: &[u8], - ) { - let mut out = vec![0; expected_hash.len()]; - let mut hasher = reference_impl::Hasher::new(); - hasher.update(input); - hasher.finalize(&mut out); - assert_eq!(expected_hash, &out[..]); - - let mut out = vec![0; expected_keyed_hash.len()]; - let mut hasher = reference_impl::Hasher::new_keyed(key); - hasher.update(input); - hasher.finalize(&mut out); - assert_eq!(expected_keyed_hash, &out[..]); - - let mut out = vec![0; expected_derive_key.len()]; - let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT); - hasher.update(input); - hasher.finalize(&mut out); - assert_eq!(expected_derive_key, &out[..]); - } - - fn test_reference_impl_one_at_a_time( - key: &[u8; blake3::KEY_LEN], - input: &[u8], - expected_hash: &[u8], - expected_keyed_hash: &[u8], - expected_derive_key: &[u8], - ) { - let mut out = vec![0; expected_hash.len()]; - let mut hasher = reference_impl::Hasher::new(); - for &b in input { - hasher.update(&[b]); - } - hasher.finalize(&mut out); - assert_eq!(expected_hash, &out[..]); - - let mut out = vec![0; expected_keyed_hash.len()]; - let mut hasher = reference_impl::Hasher::new_keyed(key); - for &b in input { - hasher.update(&[b]); - } - hasher.finalize(&mut out); - assert_eq!(expected_keyed_hash, &out[..]); - - let mut out = vec![0; expected_derive_key.len()]; - let mut hasher = 
reference_impl::Hasher::new_derive_key(TEST_CONTEXT); - for &b in input { - hasher.update(&[b]); - } - hasher.finalize(&mut out); - assert_eq!(expected_derive_key, &out[..]); - } - - fn test_incremental_all_at_once( - key: &[u8; blake3::KEY_LEN], - input: &[u8], - expected_hash: &[u8], - expected_keyed_hash: &[u8], - expected_derive_key: &[u8], - ) { - let mut out = vec![0; expected_hash.len()]; - let mut hasher = blake3::Hasher::new(); - hasher.update(input); - hasher.finalize_xof().fill(&mut out); - assert_eq!(expected_hash, &out[..]); - assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); - - let mut out = vec![0; expected_keyed_hash.len()]; - let mut hasher = blake3::Hasher::new_keyed(key); - hasher.update(input); - hasher.finalize_xof().fill(&mut out); - assert_eq!(expected_keyed_hash, &out[..]); - assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes()); - - let mut out = vec![0; expected_derive_key.len()]; - let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); - hasher.update(input); - hasher.finalize_xof().fill(&mut out); - assert_eq!(expected_derive_key, &out[..]); - assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); - } - - fn test_incremental_one_at_a_time( - key: &[u8; blake3::KEY_LEN], - input: &[u8], - expected_hash: &[u8], - expected_keyed_hash: &[u8], - expected_derive_key: &[u8], - ) { - let mut out = vec![0; expected_hash.len()]; - let mut hasher = blake3::Hasher::new(); - for &b in input { - hasher.update(&[b]); - } - hasher.finalize_xof().fill(&mut out); - assert_eq!(expected_hash, &out[..]); - assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); - - let mut out = vec![0; expected_keyed_hash.len()]; - let mut hasher = blake3::Hasher::new_keyed(key); - for &b in input { - hasher.update(&[b]); - } - hasher.finalize_xof().fill(&mut out); - assert_eq!(expected_keyed_hash, &out[..]); - assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes()); - - let mut out = vec![0; 
expected_derive_key.len()]; - let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); - for &b in input { - hasher.update(&[b]); - } - hasher.finalize_xof().fill(&mut out); - assert_eq!(expected_derive_key, &out[..]); - assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); - } - - fn test_recursive( - key: &[u8; blake3::KEY_LEN], - input: &[u8], - expected_hash: &[u8], - expected_keyed_hash: &[u8], - expected_derive_key: &[u8], - ) { - assert_eq!(&expected_hash[..32], blake3::hash(input).as_bytes()); - assert_eq!( - &expected_keyed_hash[..32], - &blake3::keyed_hash(key, input).as_bytes()[..], - ); - let mut derive_key_out = vec![0; expected_derive_key.len()]; - blake3::derive_key(TEST_CONTEXT, input, &mut derive_key_out); - assert_eq!(expected_derive_key, &derive_key_out[..],); - } - - #[test] - fn run_test_vectors() { - let cases = parse_test_cases(); - let key: &[u8; blake3::KEY_LEN] = cases.key.as_bytes().try_into().unwrap(); - for case in &cases.cases { - dbg!(case.input_len); - let mut input = vec![0; case.input_len]; - paint_test_input(&mut input); - let expected_hash = hex::decode(&case.hash).unwrap(); - let expected_keyed_hash = hex::decode(&case.keyed_hash).unwrap(); - let expected_derive_key = hex::decode(&case.derive_key).unwrap(); - - test_reference_impl_all_at_once( - key, - &input, - &expected_hash, - &expected_keyed_hash, - &expected_derive_key, - ); - - test_reference_impl_one_at_a_time( - key, - &input, - &expected_hash, - &expected_keyed_hash, - &expected_derive_key, - ); - - test_incremental_all_at_once( - key, - &input, - &expected_hash, - &expected_keyed_hash, - &expected_derive_key, - ); - - test_incremental_one_at_a_time( - key, - &input, - &expected_hash, - &expected_keyed_hash, - &expected_derive_key, - ); - - test_recursive( - key, - &input, - &expected_hash, - &expected_keyed_hash, - &expected_derive_key, - ); - } - } - - #[test] - fn test_checked_in_vectors_up_to_date() { - // Replace Windows newlines, in case Git 
is configured to alter - // newlines when files are checked out. - let json = read_test_vectors_file().replace("\r\n", "\n"); - if generate_json() != json { - panic!("Checked-in test_vectors.json is not up to date. Regenerate with `cargo run --bin generate > ./test_vectors.json`."); - } - } -} diff --git a/thirdparty/BLAKE3/test_vectors/test_vectors.json b/thirdparty/BLAKE3/test_vectors/test_vectors.json deleted file mode 100644 index f6da91792..000000000 --- a/thirdparty/BLAKE3/test_vectors/test_vectors.json +++ /dev/null @@ -1,217 +0,0 @@ -{ - "_comment": "Each test is an input length and three outputs, one for each of the hash, keyed_hash, and derive_key modes. The input in each case is filled with a repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. The key used with keyed_hash is the 32-byte ASCII string \"whats the Elvish word for friend\", also given in the `key` field below. The context string used with derive_key is the ASCII string \"BLAKE3 2019-12-27 16:29:52 test vectors context\", also given in the `context_string` field below. Outputs are encoded as hexadecimal. 
Each case is an extended output, and implementations should also check that the first 32 bytes match their default-length output.", - "key": "whats the Elvish word for friend", - "context_string": "BLAKE3 2019-12-27 16:29:52 test vectors context", - "cases": [ - { - "input_len": 0, - "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", - "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", - "derive_key": "2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0" - }, - { - "input_len": 1, - "hash": "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", - "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", - "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551" - }, - { - 
"input_len": 2, - "hash": "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a432263a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1", - "keyed_hash": "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9ffbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f658be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f", - "derive_key": "1f166565a7df0098ee65922d7fea425fb18b9943f19d6161e2d17939356168e6daa59cae19892b2d54f6fc9f475d26031fd1c22ae0a3e8ef7bdb23f452a15e0027629d2e867b1bb1e6ab21c71297377750826c404dfccc2406bd57a83775f89e0b075e59a7732326715ef912078e213944f490ad68037557518b79c0086de6d6f6cdd2" - }, - { - "input_len": 3, - "hash": "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cdd0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134", - "keyed_hash": "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f", - "derive_key": "440aba35cb006b61fc17c0529255de438efc06a8c9ebf3f2ddac3b5a86705797f27e2e914574f4d87ec04c379e12789eccbfbc15892626042707802dbe4e97c3ff59dca80c1e54246b6d055154f7348a39b7d098b2b4824ebe90e104e763b2a447512132cede16243484a55a4e40a85790038bb0dcf762e8c053cabae41bbe22a5bff7" - }, - { - "input_len": 4, - "hash": 
"f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e65797211701dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12", - "keyed_hash": "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe070116c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a", - "derive_key": "f46085c8190d69022369ce1a18880e9b369c135eb93f3c63550d3e7630e91060fbd7d8f4258bec9da4e05044f88b91944f7cab317a2f0c18279629a3867fad0662c9ad4d42c6f27e5b124da17c8c4f3a94a025ba5d1b623686c6099d202a7317a82e3d95dae46a87de0555d727a5df55de44dab799a20dffe239594d6e99ed17950910" - }, - { - "input_len": 5, - "hash": "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2ebcfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2ca748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c999904037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620", - "keyed_hash": "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616ab199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218", - "derive_key": "1f24eda69dbcb752847ec3ebb5dd42836d86e58500c7c98d906ecd82ed9ae47f6f48a3f67e4e43329c9a89b1ca526b9b35cbf7d25c1e353baffb590fd79be58ddb6c711f1a6b60e98620b851c688670412fcb0435657ba6b638d21f0f2a04f2f6b0bd8834837b10e438d5f4c7c2c71299cf7586ea9144ed09253d51f8f54dd6bff719d" - }, - { - "input_len": 6, - "hash": "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844611a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a", - 
"keyed_hash": "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e8807800842a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256", - "derive_key": "be96b30b37919fe4379dfbe752ae77b4f7e2ab92f7ff27435f76f2f065f6a5f435ae01a1d14bd5a6b3b69d8cbd35f0b01ef2173ff6f9b640ca0bd4748efa398bf9a9c0acd6a66d9332fdc9b47ffe28ba7ab6090c26747b85f4fab22f936b71eb3f64613d8bd9dfabe9bb68da19de78321b481e5297df9e40ec8a3d662f3e1479c65de0" - }, - { - "input_len": 7, - "hash": "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a941f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fef1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c", - "keyed_hash": "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5fd6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6", - "derive_key": "dc3b6485f9d94935329442916b0d059685ba815a1fa2a14107217453a7fc9f0e66266db2ea7c96843f9d8208e600a73f7f45b2f55b9e6d6a7ccf05daae63a3fdd10b25ac0bd2e224ce8291f88c05976d575df998477db86fb2cfbbf91725d62cb57acfeb3c2d973b89b503c2b60dde85a7802b69dc1ac2007d5623cbea8cbfb6b181f5" - }, - { - "input_len": 8, - "hash": "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb725d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a22e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c", - "keyed_hash": 
"be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305abf86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276", - "derive_key": "2b166978cef14d9d438046c720519d8b1cad707e199746f1562d0c87fbd32940f0e2545a96693a66654225ebbaac76d093bfa9cd8f525a53acb92a861a98c42e7d1c4ae82e68ab691d510012edd2a728f98cd4794ef757e94d6546961b4f280a51aac339cc95b64a92b83cc3f26d8af8dfb4c091c240acdb4d47728d23e7148720ef04" - }, - { - "input_len": 63, - "hash": "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b1197012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf4687093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755", - "keyed_hash": "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea05a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847abb38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f11678377483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d", - "derive_key": "b6451e30b953c206e34644c6803724e9d2725e0893039cfc49584f991f451af3b89e8ff572d3da4f4022199b9563b9d70ebb616efff0763e9abec71b550f1371e233319c4c4e74da936ba8e5bbb29a598e007a0bbfa929c99738ca2cc098d59134d11ff300c39f82e2fce9f7f0fa266459503f64ab9913befc65fddc474f6dc1c67669" - }, - { - "input_len": 64, - "hash": "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7fbb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74", - "keyed_hash": "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e68244c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f77a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c9255306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb", 
- "derive_key": "a5c4a7053fa86b64746d4bb688d06ad1f02a18fce9afd3e818fefaa7126bf73e9b9493a9befebe0bf0c9509fb3105cfa0e262cde141aa8e3f2c2f77890bb64a4cca96922a21ead111f6338ad5244f2c15c44cb595443ac2ac294231e31be4a4307d0a91e874d36fc9852aeb1265c09b6e0cda7c37ef686fbbcab97e8ff66718be048bb" - }, - { - "input_len": 65, - "hash": "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c", - "keyed_hash": "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b9041497de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad", - "derive_key": "51fd05c3c1cfbc8ed67d139ad76f5cf8236cd2acd26627a30c104dfd9d3ff8a82b02e8bd36d8498a75ad8c8e9b15eb386970283d6dd42c8ae7911cc592887fdbe26a0a5f0bf821cd92986c60b2502c9be3f98a9c133a7e8045ea867e0828c7252e739321f7c2d65daee4468eb4429efae469a42763f1f94977435d10dccae3e3dce88d" - }, - { - "input_len": 127, - "hash": "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da47644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc44355b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78", - "keyed_hash": "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd54663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc", - "derive_key": 
"c91c090ceee3a3ac81902da31838012625bbcd73fcb92e7d7e56f78deba4f0c3feeb3974306966ccb3e3c69c337ef8a45660ad02526306fd685c88542ad00f759af6dd1adc2e50c2b8aac9f0c5221ff481565cf6455b772515a69463223202e5c371743e35210bbbbabd89651684107fd9fe493c937be16e39cfa7084a36207c99bea3" - }, - { - "input_len": 128, - "hash": "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa69faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ecba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f75e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c", - "keyed_hash": "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd86bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5", - "derive_key": "81720f34452f58a0120a58b6b4608384b5c51d11f39ce97161a0c0e442ca022550e7cd651e312f0b4c6afb3c348ae5dd17d2b29fab3b894d9a0034c7b04fd9190cbd90043ff65d1657bbc05bfdecf2897dd894c7a1b54656d59a50b51190a9da44db426266ad6ce7c173a8c0bbe091b75e734b4dadb59b2861cd2518b4e7591e4b83c9" - }, - { - "input_len": 129, - "hash": "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f96ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c7127bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7", - "keyed_hash": "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aaee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412cd8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683", - "derive_key": 
"938d2d4435be30eafdbb2b7031f7857c98b04881227391dc40db3c7b21f41fc18d72d0f9c1de5760e1941aebf3100b51d64644cb459eb5d20258e233892805eb98b07570ef2a1787cd48e117c8d6a63a68fd8fc8e59e79dbe63129e88352865721c8d5f0cf183f85e0609860472b0d6087cefdd186d984b21542c1c780684ed6832d8d" - }, - { - "input_len": 1023, - "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", - "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", - "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d" - }, - { - "input_len": 1024, - "hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", - "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", - "derive_key": 
"7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad" - }, - { - "input_len": 1025, - "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", - "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", - "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad" - }, - { - "input_len": 2048, - "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", - "keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", - "derive_key": 
"7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583" - }, - { - "input_len": 2049, - "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", - "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", - "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6" - }, - { - "input_len": 3072, - "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", - "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", - "derive_key": 
"050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0" - }, - { - "input_len": 3073, - "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", - "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", - "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5" - }, - { - "input_len": 4096, - "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", - "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", - "derive_key": 
"1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245" - }, - { - "input_len": 4097, - "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", - "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", - "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad" - }, - { - "input_len": 5120, - "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", - "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", - "derive_key": 
"7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d" - }, - { - "input_len": 5121, - "hash": "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", - "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", - "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165" - }, - { - "input_len": 6144, - "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", - "keyed_hash": "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", - "derive_key": 
"2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef" - }, - { - "input_len": 6145, - "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", - "keyed_hash": "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", - "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2" - }, - { - "input_len": 7168, - "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", - "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", - "derive_key": 
"11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88" - }, - { - "input_len": 7169, - "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", - "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", - "derive_key": "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd" - }, - { - "input_len": 8192, - "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", - "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", - "derive_key": 
"ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7" - }, - { - "input_len": 8193, - "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", - "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", - "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0" - }, - { - "input_len": 16384, - "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", - "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", - "derive_key": 
"160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57" - }, - { - "input_len": 31744, - "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", - "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", - "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" - }, - { - "input_len": 102400, - "hash": "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e01c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e", - "keyed_hash": "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4", - "derive_key": 
"4652cff7a3f385a6103b5c260fc1593e13c778dbe608efb092fe7ee69df6e9c6d83a3e041bc3a48df2879f4a0a3ed40e7c961c73eff740f3117a0504c2dff4786d44fb17f1549eb0ba585e40ec29bf7732f0b7e286ff8acddc4cb1e23b87ff5d824a986458dcc6a04ac83969b80637562953df51ed1a7e90a7926924d2763778be8560" - } - ] -} diff --git a/thirdparty/BLAKE3/tools/compiler_version/Cargo.toml b/thirdparty/BLAKE3/tools/compiler_version/Cargo.toml deleted file mode 100644 index 1046cf29d..000000000 --- a/thirdparty/BLAKE3/tools/compiler_version/Cargo.toml +++ /dev/null @@ -1,7 +0,0 @@ -[package] -name = "compiler_version" -version = "0.0.0" -edition = "2018" - -[build-dependencies] -cc = "1.0.50" diff --git a/thirdparty/BLAKE3/tools/compiler_version/build.rs b/thirdparty/BLAKE3/tools/compiler_version/build.rs deleted file mode 100644 index 3e14ebe67..000000000 --- a/thirdparty/BLAKE3/tools/compiler_version/build.rs +++ /dev/null @@ -1,6 +0,0 @@ -fn main() { - let build = cc::Build::new(); - let compiler = build.get_compiler(); - let compiler_path = compiler.path().to_string_lossy(); - println!("cargo:rustc-env=COMPILER_PATH={}", compiler_path); -} diff --git a/thirdparty/BLAKE3/tools/compiler_version/src/main.rs b/thirdparty/BLAKE3/tools/compiler_version/src/main.rs deleted file mode 100644 index 767cb31bd..000000000 --- a/thirdparty/BLAKE3/tools/compiler_version/src/main.rs +++ /dev/null @@ -1,27 +0,0 @@ -use std::process::Command; - -fn main() { - // Print the rustc version. - Command::new(env!("CARGO")) - .args(&["rustc", "--quiet", "--", "--version"]) - .status() - .unwrap(); - println!(); - - // Print the Cargo version. - Command::new(env!("CARGO")) - .args(&["--version"]) - .status() - .unwrap(); - println!(); - - // Print the C compiler version. This relies on C compiler detection done - // in build.rs, which sets the COMPILER_PATH variable. - let compiler_path = env!("COMPILER_PATH"); - let mut compiler_command = Command::new(compiler_path); - // Use the --version flag on everything other than MSVC. 
- if !cfg!(target_env = "msvc") { - compiler_command.arg("--version"); - } - let _ = compiler_command.status().unwrap(); -} diff --git a/thirdparty/BLAKE3/tools/instruction_set_support/Cargo.toml b/thirdparty/BLAKE3/tools/instruction_set_support/Cargo.toml deleted file mode 100644 index 9e30174a9..000000000 --- a/thirdparty/BLAKE3/tools/instruction_set_support/Cargo.toml +++ /dev/null @@ -1,6 +0,0 @@ -[package] -name = "instruction_set_support" -version = "0.0.0" -edition = "2018" - -[dependencies] diff --git a/thirdparty/BLAKE3/tools/instruction_set_support/src/main.rs b/thirdparty/BLAKE3/tools/instruction_set_support/src/main.rs deleted file mode 100644 index 6b509b053..000000000 --- a/thirdparty/BLAKE3/tools/instruction_set_support/src/main.rs +++ /dev/null @@ -1,10 +0,0 @@ -fn main() { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - dbg!(is_x86_feature_detected!("sse2")); - dbg!(is_x86_feature_detected!("sse4.1")); - dbg!(is_x86_feature_detected!("avx2")); - dbg!(is_x86_feature_detected!("avx512f")); - dbg!(is_x86_feature_detected!("avx512vl")); - } -} @@ -4,6 +4,7 @@ set_configvar("ZEN_SCHEMA_VERSION", 4) -- store Cid data in CAS under raw hash ( add_requires( "vcpkg::asio", + "vcpkg::blake3", "vcpkg::catch2 2.13.8", "vcpkg::cpr", "vcpkg::curl", diff --git a/zencore/blake3.cpp b/zencore/blake3.cpp index 02d6eb241..89826ae5d 100644 --- a/zencore/blake3.cpp +++ b/zencore/blake3.cpp @@ -7,13 +7,10 @@ #include <zencore/testing.h> #include <zencore/zencore.h> -#include "../thirdparty/BLAKE3/c/blake3.h" -#if ZEN_PLATFORM_WINDOWS -# pragma comment(lib, "blake3.lib") -#endif - #include <string.h> +#include "blake3.h" + ////////////////////////////////////////////////////////////////////////// namespace zen { diff --git a/zencore/xmake.lua b/zencore/xmake.lua index f01c12c86..aae20274b 100644 --- a/zencore/xmake.lua +++ b/zencore/xmake.lua @@ -14,40 +14,33 @@ target('zencore') add_includedirs("include", {public=true}) 
add_includedirs("$(projectdir)/thirdparty/utfcpp/source") add_includedirs("$(projectdir)/thirdparty/trace", {public=true}) + add_links("blake3") if is_os("windows") then - add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Win64") add_linkdirs("$(projectdir)/thirdparty/Oodle/lib/Win64") elseif is_os("linux") then - add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Linux_x64") add_linkdirs("$(projectdir)/thirdparty/Oodle/lib/Linux_x64") - add_links("blake3") add_links("oo2corelinux64") add_syslinks("pthread") elseif is_os("macosx") then - if is_arch("arm64") then - add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Mac_arm64") - else - add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Mac_x64") - end add_linkdirs("$(projectdir)/thirdparty/Oodle/lib/Mac_x64") - add_links("blake3") add_links("oo2coremac64") end add_options("zentrace") add_packages( - "vcpkg::spdlog", - "vcpkg::fmt", - "vcpkg::doctest", + "vcpkg::blake3", "vcpkg::catch2", + "vcpkg::cpr", + "vcpkg::curl", -- required by cpr + "vcpkg::doctest", + "vcpkg::fmt", + "vcpkg::gsl-lite", "vcpkg::json11", "vcpkg::lz4", "vcpkg::mimalloc", - "vcpkg::cpr", - "vcpkg::curl", -- required by cpr - "vcpkg::zlib", -- required by curl "vcpkg::openssl", -- required by curl - "vcpkg::xxhash", - "vcpkg::gsl-lite") + "vcpkg::spdlog", + "vcpkg::zlib", -- required by curl + "vcpkg::xxhash") if is_plat("linux") then -- The 'vcpkg::openssl' package is two libraries; ssl and crypto, with |