about summary refs log tree commit diff
diff options
context:
space:
mode:
author Stefan Boberg <[email protected]> 2022-09-20 17:28:41 +0200
committer GitHub <[email protected]> 2022-09-20 17:28:41 +0200
commit a735967c7c54fcecbfd9760286afc06a3b48233a (patch)
tree 4789717b7a05c7122cb366d3bcf5810db9678058
parent rename URI chunk requests from value -> chunk (#166) (diff)
download zen-a735967c7c54fcecbfd9760286afc06a3b48233a.tar.xz
zen-a735967c7c54fcecbfd9760286afc06a3b48233a.zip
Use BLAKE3 port from vcpkg (#141)
use BLAKE3 port from vcpkg instead of in-tree binaries
-rw-r--r-- thirdparty/BLAKE3/.github/workflows/build_b3sum.py 37
-rw-r--r-- thirdparty/BLAKE3/.github/workflows/ci.yml 208
-rw-r--r-- thirdparty/BLAKE3/.github/workflows/tag.yml 45
-rw-r--r-- thirdparty/BLAKE3/.github/workflows/upload_github_release_asset.py 65
-rw-r--r-- thirdparty/BLAKE3/.gitignore 2
-rw-r--r-- thirdparty/BLAKE3/CONTRIBUTING.md 31
-rw-r--r-- thirdparty/BLAKE3/Cargo.toml 90
-rw-r--r-- thirdparty/BLAKE3/LICENSE 330
-rw-r--r-- thirdparty/BLAKE3/README.md 202
-rw-r--r-- thirdparty/BLAKE3/b3sum/Cargo.toml 27
-rw-r--r-- thirdparty/BLAKE3/b3sum/README.md 86
-rw-r--r-- thirdparty/BLAKE3/b3sum/src/main.rs 621
-rw-r--r-- thirdparty/BLAKE3/b3sum/src/unit_tests.rs 189
-rw-r--r-- thirdparty/BLAKE3/b3sum/tests/cli_tests.rs 552
-rw-r--r-- thirdparty/BLAKE3/b3sum/what_does_check_do.md 174
-rw-r--r-- thirdparty/BLAKE3/benches/bench.rs 520
-rw-r--r-- thirdparty/BLAKE3/build.rs 260
-rw-r--r-- thirdparty/BLAKE3/c/.gitignore 3
-rw-r--r-- thirdparty/BLAKE3/c/Makefile.testing 78
-rw-r--r-- thirdparty/BLAKE3/c/README.md 270
-rw-r--r-- thirdparty/BLAKE3/c/blake3.c 607
-rw-r--r-- thirdparty/BLAKE3/c/blake3.h 60
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx2.c 325
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S 1815
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S 1817
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm 1828
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx512.c 1204
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S 2585
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S 2615
-rw-r--r-- thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm 2634
-rw-r--r-- thirdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml 29
-rw-r--r-- thirdparty/BLAKE3/c/blake3_c_rust_bindings/README.md 4
-rw-r--r-- thirdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs 393
-rw-r--r-- thirdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs 182
-rw-r--r-- thirdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh 31
-rw-r--r-- thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs 299
-rw-r--r-- thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs 511
-rw-r--r-- thirdparty/BLAKE3/c/blake3_dispatch.c 276
-rw-r--r-- thirdparty/BLAKE3/c/blake3_impl.h 269
-rw-r--r-- thirdparty/BLAKE3/c/blake3_neon.c 346
-rw-r--r-- thirdparty/BLAKE3/c/blake3_portable.c 160
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse2.c 565
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S 2291
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S 2332
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm 2350
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse41.c 559
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S 2028
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S 2069
-rw-r--r-- thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm 2089
-rw-r--r-- thirdparty/BLAKE3/c/example.c 27
-rw-r--r-- thirdparty/BLAKE3/c/main.c 166
-rw-r--r-- thirdparty/BLAKE3/c/test.py 97
-rw-r--r-- thirdparty/BLAKE3/lib/Linux_x64/libblake3.a bin 94190 -> 0 bytes
-rw-r--r-- thirdparty/BLAKE3/lib/Mac_arm64/libblake3.a bin 26576 -> 0 bytes
-rw-r--r-- thirdparty/BLAKE3/lib/Mac_x64/libblake3.a bin 78200 -> 0 bytes
-rw-r--r-- thirdparty/BLAKE3/lib/Win64/BLAKE3.lib bin 172346 -> 0 bytes
-rw-r--r-- thirdparty/BLAKE3/media/B3.svg 70
-rw-r--r-- thirdparty/BLAKE3/media/BLAKE3.svg 85
-rw-r--r-- thirdparty/BLAKE3/media/speed.svg 1474
-rw-r--r-- thirdparty/BLAKE3/reference_impl/Cargo.toml 8
-rw-r--r-- thirdparty/BLAKE3/reference_impl/README.md 9
-rw-r--r-- thirdparty/BLAKE3/reference_impl/reference_impl.rs 383
-rw-r--r-- thirdparty/BLAKE3/src/ffi_avx2.rs 63
-rw-r--r-- thirdparty/BLAKE3/src/ffi_avx512.rs 114
-rw-r--r-- thirdparty/BLAKE3/src/ffi_neon.rs 82
-rw-r--r-- thirdparty/BLAKE3/src/ffi_sse2.rs 114
-rw-r--r-- thirdparty/BLAKE3/src/ffi_sse41.rs 114
-rw-r--r-- thirdparty/BLAKE3/src/guts.rs 95
-rw-r--r-- thirdparty/BLAKE3/src/join.rs 120
-rw-r--r-- thirdparty/BLAKE3/src/lib.rs 1359
-rw-r--r-- thirdparty/BLAKE3/src/platform.rs 487
-rw-r--r-- thirdparty/BLAKE3/src/portable.rs 198
-rw-r--r-- thirdparty/BLAKE3/src/rust_avx2.rs 474
-rw-r--r-- thirdparty/BLAKE3/src/rust_sse2.rs 775
-rw-r--r-- thirdparty/BLAKE3/src/rust_sse41.rs 766
-rw-r--r-- thirdparty/BLAKE3/src/test.rs 569
-rw-r--r-- thirdparty/BLAKE3/src/traits.rs 184
-rw-r--r-- thirdparty/BLAKE3/test_vectors/Cargo.toml 18
-rw-r--r-- thirdparty/BLAKE3/test_vectors/cross_test.sh 25
-rw-r--r-- thirdparty/BLAKE3/test_vectors/src/lib.rs 349
-rw-r--r-- thirdparty/BLAKE3/test_vectors/test_vectors.json 217
-rw-r--r-- thirdparty/BLAKE3/tools/compiler_version/Cargo.toml 7
-rw-r--r-- thirdparty/BLAKE3/tools/compiler_version/build.rs 6
-rw-r--r-- thirdparty/BLAKE3/tools/compiler_version/src/main.rs 27
-rw-r--r-- thirdparty/BLAKE3/tools/instruction_set_support/Cargo.toml 6
-rw-r--r-- thirdparty/BLAKE3/tools/instruction_set_support/src/main.rs 10
-rw-r--r-- xmake.lua 1
-rw-r--r-- zencore/blake3.cpp 7
-rw-r--r-- zencore/xmake.lua 27
89 files changed, 13 insertions, 44583 deletions
diff --git a/thirdparty/BLAKE3/.github/workflows/build_b3sum.py b/thirdparty/BLAKE3/.github/workflows/build_b3sum.py
deleted file mode 100644
index e487daf97..000000000
--- a/thirdparty/BLAKE3/.github/workflows/build_b3sum.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#! /usr/bin/env python3
-
-from pathlib import Path
-import platform
-import shutil
-import subprocess
-import sys
-
-ROOT = Path(__file__).parent.parent.parent
-RUST_TARGET = sys.argv[1]
-
-subprocess.run(["cargo", "build", "--target", sys.argv[1], "--release"],
- cwd=ROOT / "b3sum")
-
-if platform.system() == "Windows":
- original_exe_name = "b3sum.exe"
-else:
- original_exe_name = "b3sum"
-
-if platform.system() == "Windows":
- new_exe_name = "b3sum_windows_x64_bin.exe"
-elif platform.system() == "Darwin":
- new_exe_name = "b3sum_macos_x64_bin"
-elif platform.system() == "Linux":
- new_exe_name = "b3sum_linux_x64_bin"
-else:
- raise RuntimeError("Unexpected platform: " + platform.system())
-
-# Copy the built binary so that it has the upload name we want.
-out_dir = ROOT / "b3sum/target" / RUST_TARGET / "release"
-original_exe_path = str(out_dir / original_exe_name)
-new_exe_path = str(out_dir / new_exe_name)
-print("copying", repr(original_exe_path), "to", repr(new_exe_path))
-shutil.copyfile(original_exe_path, new_exe_path)
-
-# This lets the subsequent upload step get the filepath.
-print("::set-output name=bin_path::" + new_exe_path)
diff --git a/thirdparty/BLAKE3/.github/workflows/ci.yml b/thirdparty/BLAKE3/.github/workflows/ci.yml
deleted file mode 100644
index 464a411d5..000000000
--- a/thirdparty/BLAKE3/.github/workflows/ci.yml
+++ /dev/null
@@ -1,208 +0,0 @@
-name: tests
-
-on:
- push:
- branches:
- - "*"
- # not on tags
- pull_request:
-
-env:
- BLAKE3_CI: "1"
- RUSTFLAGS: "-D warnings"
- RUST_BACKTRACE: "1"
-
-jobs:
- cargo_tests:
- name: ${{ matrix.target.name }} ${{ matrix.channel }}
- runs-on: ${{ matrix.target.os }}
- strategy:
- fail-fast: false
- matrix:
- target: [
- { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" },
- { "os": "macOS-latest", "toolchain": "x86_64-apple-darwin", "name": "macOS" },
- { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" },
- { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" }
- ]
- channel: [stable, beta, nightly]
-
- steps:
- - uses: actions/checkout@v1
- - uses: actions-rs/toolchain@v1
- with:
- toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
- profile: minimal
- override: true
- # Print the compiler version, for debugging.
- - name: print compiler version
- run: cargo run --quiet
- working-directory: ./tools/compiler_version
- # Print out instruction set support, for debugging.
- - name: print instruction set support
- run: cargo run --quiet
- working-directory: ./tools/instruction_set_support
- # Default tests plus Rayon.
- - run: cargo test --features=rayon
- # no_std tests.
- - run: cargo test --no-default-features
-
- # A matrix of different test settings:
- # - debug vs release
- # - assembly vs Rust+C intrinsics vs pure Rust intrinsics
- # - different levels of SIMD support
- #
- # Full SIMD support.
- - run: cargo test --features=
- - run: cargo test --features=prefer_intrinsics
- - run: cargo test --features=pure
- - run: cargo test --features= --release
- - run: cargo test --features=prefer_intrinsics --release
- - run: cargo test --features=pure --release
- # No AVX-512.
- - run: cargo test --features=no_avx512
- - run: cargo test --features=no_avx512,prefer_intrinsics
- - run: cargo test --features=no_avx512,pure
- - run: cargo test --features=no_avx512 --release
- - run: cargo test --features=no_avx512,prefer_intrinsics --release
- - run: cargo test --features=no_avx512,pure --release
- # No AVX2.
- - run: cargo test --features=no_avx512,no_avx2
- - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics
- - run: cargo test --features=no_avx512,no_avx2,pure
- - run: cargo test --features=no_avx512,no_avx2 --release
- - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics --release
- - run: cargo test --features=no_avx512,no_avx2,pure --release
- # No SSE4.1
- - run: cargo test --features=no_avx512,no_avx2,no_sse41
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure
- - run: cargo test --features=no_avx512,no_avx2,no_sse41 --release
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics --release
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure --release
- # No SSE2
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 --release
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics --release
- - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure --release
-
- # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains.
- - run: cargo test --benches
- env:
- RUSTC_BOOTSTRAP: 1
- # Test vectors.
- - name: test vectors
- run: cargo test
- working-directory: ./test_vectors
- - name: test vectors intrinsics
- run: cargo test --features=prefer_intrinsics
- working-directory: ./test_vectors
- - name: test vectors pure
- run: cargo test --features=pure
- working-directory: ./test_vectors
- # Test b3sum.
- - name: test b3sum
- run: cargo test
- working-directory: ./b3sum
- - name: test b3sum --no-default-features
- run: cargo test --no-default-features
- working-directory: ./b3sum
- # Test C code.
- - name: cargo test C bindings assembly
- run: cargo test
- working-directory: ./c/blake3_c_rust_bindings
- - name: cargo test C bindings intrinsics
- run: cargo test --features=prefer_intrinsics
- working-directory: ./c/blake3_c_rust_bindings
- # Reference impl doc test.
- - name: reference impl doc test
- run: cargo test
- working-directory: ./reference_impl
-
- cross_tests:
- name: cross ${{ matrix.arch }}
- runs-on: ubuntu-latest
- strategy:
- fail-fast: false
- matrix:
- arch:
- - i586-unknown-linux-musl
- - i686-unknown-linux-musl
- - armv7-unknown-linux-gnueabihf
- - aarch64-unknown-linux-gnu
- - mips-unknown-linux-gnu
-
- steps:
- - uses: actions/checkout@v1
- - uses: actions-rs/toolchain@v1
- with:
- toolchain: stable
- override: true
- - run: cargo install cross
- # Test the portable implementation on everything.
- - run: cross test --target ${{ matrix.arch }}
- # Test building for ancient i386 processors without guaranteed SSE2 support.
- - run: cross rustc --target ${{ matrix.arch }} -- -C target-cpu=i386
- if: startsWith(matrix.arch, 'i586-') || startsWith(matrix.arch, 'i686-')
- # Test the NEON implementation on ARM targets.
- - run: cross test --target ${{ matrix.arch }} --features=neon
- if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-')
- # Test vectors. Note that this uses a hacky script due to path dependency limitations.
- - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }}
- # C code. Same issue with the hacky script.
- - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }}
- - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} --features=neon
- if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-')
-
- # Currently only on x86.
- c_tests:
- name: C Makefile tests
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v1
- # Test the intrinsics-based implementations.
- - run: make -f Makefile.testing test
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_sse2.c
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_sse41.c
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_avx2.c
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_avx512.c
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test
- working-directory: ./c
- # Test the assembly implementations.
- - run: make -f Makefile.testing test_asm
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_sse2_x86-64_unix.S
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test_asm
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_sse41_x86-64_unix.S
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test_asm
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_avx2_x86-64_unix.S
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test_asm
- working-directory: ./c
- - run: make -f Makefile.testing clean && rm blake3_avx512_x86-64_unix.S
- working-directory: ./c
- - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test_asm
- working-directory: ./c
- # Restore the files we deleted above.
- - run: git checkout .
- # Build the example.
- - run: make -f Makefile.testing example
- working-directory: ./c
diff --git a/thirdparty/BLAKE3/.github/workflows/tag.yml b/thirdparty/BLAKE3/.github/workflows/tag.yml
deleted file mode 100644
index 577d4f312..000000000
--- a/thirdparty/BLAKE3/.github/workflows/tag.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-name: publish_b3sum_binaries
-
-on:
- push:
- tags:
- - "*"
-
-env:
- BLAKE3_CI: "1"
- RUSTFLAGS: "-D warnings"
-
-jobs:
- cargo_tests:
- name: ${{ matrix.target.name }}
- runs-on: ${{ matrix.target.os }}
- strategy:
- fail-fast: false
- matrix:
- target: [
- { "os": "ubuntu-latest", "rust-target": "x86_64-unknown-linux-musl", "name": "Linux" },
- { "os": "macOS-latest", "rust-target": "x86_64-apple-darwin", "name": "macOS" },
- { "os": "windows-latest", "rust-target": "x86_64-pc-windows-msvc", "name": "Windows" },
- ]
-
- steps:
- - uses: actions/checkout@v1
- - uses: actions/setup-python@v1
- with:
- python-version: "3.x"
- - run: pip install PyGithub
- - run: sudo apt-get install musl-tools
- if: matrix.target.os == 'ubuntu-latest'
- - uses: actions-rs/toolchain@v1
- with:
- toolchain: stable
- profile: minimal
- - run: rustup target add ${{ matrix.target.rust-target }}
- - name: build b3sum
- id: build_b3sum
- run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }}
- - name: upload release asset
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- GITHUB_TAG: ${{ github.ref }}
- run: python -u .github/workflows/upload_github_release_asset.py ${{ steps.build_b3sum.outputs.bin_path }}
diff --git a/thirdparty/BLAKE3/.github/workflows/upload_github_release_asset.py b/thirdparty/BLAKE3/.github/workflows/upload_github_release_asset.py
deleted file mode 100644
index c1cbf518b..000000000
--- a/thirdparty/BLAKE3/.github/workflows/upload_github_release_asset.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#! /usr/bin/env python3
-
-import github
-import os
-import sys
-
-RETRIES = 10
-
-g = github.Github(os.environ["GITHUB_TOKEN"])
-tag_name = os.environ["GITHUB_TAG"]
-tag_prefix = "refs/tags/"
-if tag_name.startswith(tag_prefix):
- tag_name = tag_name[len(tag_prefix):]
-assert len(sys.argv) == 2
-asset_path = sys.argv[1]
-asset_name = os.path.basename(asset_path)
-
-repo = g.get_repo(os.environ["GITHUB_REPOSITORY"])
-
-tags = list(repo.get_tags())
-
-for tag in tags:
- if tag.name == tag_name:
- break
-else:
- raise RuntimeError("no tag named " + repr(tag_name))
-
-try:
- print("Creating GitHub release for tag " + repr(tag_name) + "...")
- repo.create_git_release(tag_name, tag_name, tag.commit.commit.message)
-except github.GithubException as github_error:
- if github_error.data["errors"][0]["code"] == "already_exists":
- print("Release for tag " + repr(tag_name) + " already exists.")
- else:
- raise
-
-releases = list(repo.get_releases())
-for release in releases:
- if release.tag_name == tag_name:
- break
-else:
- raise RuntimeError("no release for tag " + repr(tag_name))
-
-print("Uploading " + repr(asset_path) + "...")
-for i in range(RETRIES):
- try:
- print("Upload attempt #{} of {}...".format(i + 1, RETRIES))
- release.upload_asset(asset_path)
- break
- except github.GithubException as github_error:
- # Unfortunately the asset upload API is flaky. Even worse, it often
- # partially succeeds, returning an error to the caller but leaving the
- # release in a state where subsequent uploads of the same asset will
- # fail with an "already_exists" error. (Though the asset is not visible
- # on github.com, so we can't just declare victory and move on.) If we
- # detect this case, explicitly delete the asset and continue retrying.
- print(github_error)
- for asset in release.get_assets():
- if asset.name == asset_name:
- print("Found uploaded asset after failure. Deleting...")
- asset.delete_asset()
-else:
- raise RuntimeError("All upload attempts failed.")
-
-print("Success!")
diff --git a/thirdparty/BLAKE3/.gitignore b/thirdparty/BLAKE3/.gitignore
deleted file mode 100644
index fa8d85ac5..000000000
--- a/thirdparty/BLAKE3/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-Cargo.lock
-target
diff --git a/thirdparty/BLAKE3/CONTRIBUTING.md b/thirdparty/BLAKE3/CONTRIBUTING.md
deleted file mode 100644
index 3a605f255..000000000
--- a/thirdparty/BLAKE3/CONTRIBUTING.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Contributing
-
-We welcome and encourage third-party contributions to BLAKE3, be it reports of issues encountered while using the software or proposals of patches.
-
-## Bug reports
-
-Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues).
-
-If you report a bug, please:
-
-* Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues).
-* Provide information to help us diagnose and ideally reproduce the bug.
-
-## Patches
-
-We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring to it in the PR.
-
-If you contribute code and submit a patch, please note the following:
-
-* We use Rust's stable branch for developing BLAKE3.
-* Pull requests should target the `master` branch.
-* Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/).
-
-Also please make sure to create new unit tests covering your code additions. You can execute the tests by running:
-
-```bash
-cargo test
-```
-
-All third-party contributions will be recognized in the list of contributors.
-
diff --git a/thirdparty/BLAKE3/Cargo.toml b/thirdparty/BLAKE3/Cargo.toml
deleted file mode 100644
index 3df0fd279..000000000
--- a/thirdparty/BLAKE3/Cargo.toml
+++ /dev/null
@@ -1,90 +0,0 @@
-[package]
-name = "blake3"
-version = "0.3.7"
-authors = ["Jack O'Connor <[email protected]>"]
-description = "the BLAKE3 hash function"
-repository = "https://github.com/BLAKE3-team/BLAKE3"
-license = "CC0-1.0 OR Apache-2.0"
-documentation = "https://docs.rs/blake3"
-readme = "README.md"
-edition = "2018"
-
-[features]
-default = ["std"]
-
-# The NEON implementation does not participate in dynamic feature detection,
-# which is currently x86-only. If "neon" is on, NEON support is assumed. Note
-# that AArch64 always supports NEON, but support on ARMv7 varies. The NEON
-# implementation uses C intrinsics and requires a C compiler.
-neon = []
-
-# This crate uses libstd for std::io trait implementations, and also for
-# runtime CPU feature detection. This feature is enabled by default. If you use
-# --no-default-features, the only way to use the SIMD implementations in this
-# crate is to enable the corresponding instruction sets statically for the
-# entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
-std = ["digest/std"]
-
-# The "rayon" feature (defined below as an optional dependency) enables the
-# join::RayonJoin type, which can be used with Hasher::update_with_join to
-# perform multi-threaded hashing. However, even if this feature is enabled, all
-# other APIs remain single-threaded.
-
-# ---------- Features below this line are for internal testing only. ----------
-
-# By default on x86_64, this crate uses Samuel Neves' hand-written assembly
-# implementations for SSE4.1, AVX2, and AVX512. (These provide both the best
-# runtime performance, and the fastest build times.) And by default on 32-bit
-# x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and
-# a C intrinsics implementation for AVX-512. In both cases, if a C compiler is
-# not detected, or if AVX-512 support is missing from the detected compiler,
-# build.rs automatically falls back to a pure Rust build. This feature forces
-# that fallback, for testing purposes. (Note that in CI testing, we set the
-# BLAKE3_CI environment variable, which instructs build.rs to error out rather
-# than doing an automatic fallback.)
-pure = []
-
-# As described above, on x86_64 this crate uses assembly implementations by
-# default. Enabling the "prefer_intrinsics" feature makes this crate use
-# intrinsics implementations on both 32-bit and 64-bit x86, again for testing
-# purposes.
-prefer_intrinsics = []
-
-# Disable individual instruction sets. CI testing uses these flags to simulate
-# different levels of hardware SIMD support. Note that code for the
-# corresponding instruction set is still compiled; only detection is disabled.
-#
-# As noted above, these flags are *for testing only* and are not stable. It's
-# possible that some users might find that their particular use case performs
-# better if e.g. AVX-512 is disabled, because of issues like CPU downclocking.
-# If that comes up, and if disabling the instruction set here at the feature
-# level turns out to be the right approach, then we can design a stable
-# feature. Until then, we reserve the right to break these features in a patch
-# release.
-no_sse2 = []
-no_sse41 = []
-no_avx2 = []
-no_avx512 = []
-
-[package.metadata.docs.rs]
-# Document blake3::join::RayonJoin on docs.rs.
-features = ["rayon"]
-
-[dependencies]
-arrayref = "0.3.5"
-arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] }
-constant_time_eq = "0.1.5"
-rayon = { version = "1.2.1", optional = true }
-cfg-if = "0.1.10"
-digest = "0.9.0"
-crypto-mac = "0.8.0"
-
-[dev-dependencies]
-hex = "0.4.2"
-page_size = "0.4.1"
-rand = "0.7.2"
-rand_chacha = "0.2.1"
-reference_impl = { path = "./reference_impl" }
-
-[build-dependencies]
-cc = "1.0.4"
diff --git a/thirdparty/BLAKE3/LICENSE b/thirdparty/BLAKE3/LICENSE
deleted file mode 100644
index f5892efc3..000000000
--- a/thirdparty/BLAKE3/LICENSE
+++ /dev/null
@@ -1,330 +0,0 @@
-This work is released into the public domain with CC0 1.0. Alternatively, it is
-licensed under the Apache License 2.0.
-
--------------------------------------------------------------------------------
-
-Creative Commons Legal Code
-
-CC0 1.0 Universal
-
- CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
- LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
- ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
- INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
- REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
- PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
- THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
- HEREUNDER.
-
-Statement of Purpose
-
-The laws of most jurisdictions throughout the world automatically confer
-exclusive Copyright and Related Rights (defined below) upon the creator
-and subsequent owner(s) (each and all, an "owner") of an original work of
-authorship and/or a database (each, a "Work").
-
-Certain owners wish to permanently relinquish those rights to a Work for
-the purpose of contributing to a commons of creative, cultural and
-scientific works ("Commons") that the public can reliably and without fear
-of later claims of infringement build upon, modify, incorporate in other
-works, reuse and redistribute as freely as possible in any form whatsoever
-and for any purposes, including without limitation commercial purposes.
-These owners may contribute to the Commons to promote the ideal of a free
-culture and the further production of creative, cultural and scientific
-works, or to gain reputation or greater distribution for their Work in
-part through the use and efforts of others.
-
-For these and/or other purposes and motivations, and without any
-expectation of additional consideration or compensation, the person
-associating CC0 with a Work (the "Affirmer"), to the extent that he or she
-is an owner of Copyright and Related Rights in the Work, voluntarily
-elects to apply CC0 to the Work and publicly distribute the Work under its
-terms, with knowledge of his or her Copyright and Related Rights in the
-Work and the meaning and intended legal effect of CC0 on those rights.
-
-1. Copyright and Related Rights. A Work made available under CC0 may be
-protected by copyright and related or neighboring rights ("Copyright and
-Related Rights"). Copyright and Related Rights include, but are not
-limited to, the following:
-
- i. the right to reproduce, adapt, distribute, perform, display,
- communicate, and translate a Work;
- ii. moral rights retained by the original author(s) and/or performer(s);
-iii. publicity and privacy rights pertaining to a person's image or
- likeness depicted in a Work;
- iv. rights protecting against unfair competition in regards to a Work,
- subject to the limitations in paragraph 4(a), below;
- v. rights protecting the extraction, dissemination, use and reuse of data
- in a Work;
- vi. database rights (such as those arising under Directive 96/9/EC of the
- European Parliament and of the Council of 11 March 1996 on the legal
- protection of databases, and under any national implementation
- thereof, including any amended or successor version of such
- directive); and
-vii. other similar, equivalent or corresponding rights throughout the
- world based on applicable law or treaty, and any national
- implementations thereof.
-
-2. Waiver. To the greatest extent permitted by, but not in contravention
-of, applicable law, Affirmer hereby overtly, fully, permanently,
-irrevocably and unconditionally waives, abandons, and surrenders all of
-Affirmer's Copyright and Related Rights and associated claims and causes
-of action, whether now known or unknown (including existing as well as
-future claims and causes of action), in the Work (i) in all territories
-worldwide, (ii) for the maximum duration provided by applicable law or
-treaty (including future time extensions), (iii) in any current or future
-medium and for any number of copies, and (iv) for any purpose whatsoever,
-including without limitation commercial, advertising or promotional
-purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
-member of the public at large and to the detriment of Affirmer's heirs and
-successors, fully intending that such Waiver shall not be subject to
-revocation, rescission, cancellation, termination, or any other legal or
-equitable action to disrupt the quiet enjoyment of the Work by the public
-as contemplated by Affirmer's express Statement of Purpose.
-
-3. Public License Fallback. Should any part of the Waiver for any reason
-be judged legally invalid or ineffective under applicable law, then the
-Waiver shall be preserved to the maximum extent permitted taking into
-account Affirmer's express Statement of Purpose. In addition, to the
-extent the Waiver is so judged Affirmer hereby grants to each affected
-person a royalty-free, non transferable, non sublicensable, non exclusive,
-irrevocable and unconditional license to exercise Affirmer's Copyright and
-Related Rights in the Work (i) in all territories worldwide, (ii) for the
-maximum duration provided by applicable law or treaty (including future
-time extensions), (iii) in any current or future medium and for any number
-of copies, and (iv) for any purpose whatsoever, including without
-limitation commercial, advertising or promotional purposes (the
-"License"). The License shall be deemed effective as of the date CC0 was
-applied by Affirmer to the Work. Should any part of the License for any
-reason be judged legally invalid or ineffective under applicable law, such
-partial invalidity or ineffectiveness shall not invalidate the remainder
-of the License, and in such case Affirmer hereby affirms that he or she
-will not (i) exercise any of his or her remaining Copyright and Related
-Rights in the Work or (ii) assert any associated claims and causes of
-action with respect to the Work, in either case contrary to Affirmer's
-express Statement of Purpose.
-
-4. Limitations and Disclaimers.
-
- a. No trademark or patent rights held by Affirmer are waived, abandoned,
- surrendered, licensed or otherwise affected by this document.
- b. Affirmer offers the Work as-is and makes no representations or
- warranties of any kind concerning the Work, express, implied,
- statutory or otherwise, including without limitation warranties of
- title, merchantability, fitness for a particular purpose, non
- infringement, or the absence of latent or other defects, accuracy, or
- the present or absence of errors, whether or not discoverable, all to
- the greatest extent permissible under applicable law.
- c. Affirmer disclaims responsibility for clearing rights of other persons
- that may apply to the Work or any use thereof, including without
- limitation any person's Copyright and Related Rights in the Work.
- Further, Affirmer disclaims responsibility for obtaining any necessary
- consents, permissions or other rights required for any use of the
- Work.
- d. Affirmer understands and acknowledges that Creative Commons is not a
- party to this document and has no duty or obligation with respect to
- this CC0 or use of the Work.
-
--------------------------------------------------------------------------------
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright 2019 Jack O'Connor and Samuel Neves
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/thirdparty/BLAKE3/README.md b/thirdparty/BLAKE3/README.md
deleted file mode 100644
index 360183668..000000000
--- a/thirdparty/BLAKE3/README.md
+++ /dev/null
@@ -1,202 +0,0 @@
-# <a href="#"><img src="media/BLAKE3.svg" alt="BLAKE3" height=50></a>
-
-BLAKE3 is a cryptographic hash function that is:
-
-- **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2.
-- **Secure**, unlike MD5 and SHA-1. And secure against length extension,
- unlike SHA-2.
-- **Highly parallelizable** across any number of threads and SIMD lanes,
- because it's a Merkle tree on the inside.
-- Capable of **verified streaming** and **incremental updates**, again
- because it's a Merkle tree.
-- A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash.
-- **One algorithm with no variants**, which is fast on x86-64 and also
- on smaller architectures.
-
-The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py)
-is an example benchmark of 16 KiB inputs on modern server hardware (a Cascade
-Lake-SP 8275CL processor). For more detailed benchmarks, see the
-[BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
-
-<p align="center">
-<img src="media/speed.svg" alt="performance graph">
-</p>
-
-BLAKE3 is based on an optimized instance of the established hash
-function [BLAKE2](https://blake2.net) and on the [original Bao tree
-mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md).
-The specifications and design rationale are available in the [BLAKE3
-paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
-The default output size is 256 bits. The current version of
-[Bao](https://github.com/oconnor663/bao) implements verified streaming
-with BLAKE3.
-
-This repository is the official implementation of BLAKE3. It includes:
-
-* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
- includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512,
- and NEON, with automatic runtime CPU feature detection on x86. The
- `rayon` feature provides multithreading.
-
-* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
- provides a command line interface. It uses multithreading by default,
- making it an order of magnitude faster than e.g. `sha256sum` on
- typical desktop hardware.
-
-* The [C implementation](c), which like the Rust implementation includes
- SIMD code and runtime CPU feature detection on x86. Unlike the Rust
- implementation, it's not currently multithreaded. See
- [`c/README.md`](c/README.md).
-
-* The [reference implementation](reference_impl/reference_impl.rs),
- which is discussed in Section 5.1 of the [BLAKE3
- paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
- This implementation is much smaller and simpler than the optimized
- ones above. If you want to see how BLAKE3 works, or you're writing a
- port that doesn't need multithreading or SIMD optimizations, start
- here.
-
-* A [set of test
- vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json)
- that covers extended outputs, all three modes, and a variety of input
- lengths.
-
-* [![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions)
-
-BLAKE3 was designed by:
-
-* [@oconnor663](https://github.com/oconnor663) (Jack O'Connor)
-* [@sneves](https://github.com/sneves) (Samuel Neves)
-* [@veorq](https://github.com/veorq) (Jean-Philippe Aumasson)
-* [@zookozcash](https://github.com/zookozcash) (Zooko)
-
-The development of BLAKE3 was sponsored by
-[Teserakt](https://teserakt.io) and [Electric Coin Company](https://electriccoin.co).
-
-*NOTE: BLAKE3 is not a password hashing algorithm, because it's
-designed to be fast, whereas password hashing should not be fast. If you
-hash passwords to store the hashes or if you derive keys from passwords,
-we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).*
-
-## Usage
-
-### The `b3sum` utility
-
-The `b3sum` command line utility prints the BLAKE3 hashes of files or of
-standard input. Prebuilt binaries are available for Linux, Windows, and
-macOS (requiring the [unidentified developer
-workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac))
-on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases).
-If you've [installed Rust and
-Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html),
-you can also build `b3sum` yourself with:
-
-```bash
-cargo install b3sum
-```
-
-If `rustup` didn't configure your `PATH` for you, you might need to go
-looking for the installed binary in e.g. `~/.cargo/bin`. You can test
-out how fast BLAKE3 is on your machine by creating a big file and
-hashing it, for example:
-
-```bash
-# Create a 1 GB file.
-head -c 1000000000 /dev/zero > /tmp/bigfile
-# Hash it with SHA-256.
-time openssl sha256 /tmp/bigfile
-# Hash it with BLAKE3.
-time b3sum /tmp/bigfile
-```
-
-### The `blake3` crate [![docs.rs](https://docs.rs/blake3/badge.svg)](https://docs.rs/blake3)
-
-To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to
-your `Cargo.toml`. Here's an example of hashing some input bytes:
-
-```rust
-// Hash an input all at once.
-let hash1 = blake3::hash(b"foobarbaz");
-
-// Hash an input incrementally.
-let mut hasher = blake3::Hasher::new();
-hasher.update(b"foo");
-hasher.update(b"bar");
-hasher.update(b"baz");
-let hash2 = hasher.finalize();
-assert_eq!(hash1, hash2);
-
-// Extended output. OutputReader also implements Read and Seek.
-let mut output = [0; 1000];
-let mut output_reader = hasher.finalize_xof();
-output_reader.fill(&mut output);
-assert_eq!(&output[..32], hash1.as_bytes());
-
-// Print a hash as hex.
-println!("{}", hash1.to_hex());
-```
-
-Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and
-`derive_key`. The `keyed_hash` mode takes a 256-bit key:
-
-```rust
-// MAC an input all at once.
-let example_key = [42u8; 32];
-let mac1 = blake3::keyed_hash(&example_key, b"example input");
-
-// MAC incrementally.
-let mut hasher = blake3::Hasher::new_keyed(&example_key);
-hasher.update(b"example input");
-let mac2 = hasher.finalize();
-assert_eq!(mac1, mac2);
-```
-
-The `derive_key` mode takes a context string of any length and key
-material of any length, and it outputs a derived key of any length. The
-context string should be hardcoded, globally unique, and
-application-specific. A good default format for the context string is
-`"[application] [commit timestamp] [purpose]"`:
-
-```rust
-// Derive a couple of subkeys for different purposes.
-const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key";
-const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key";
-let input_key = b"some very secret key material (>'-')> <('-'<) ^('-')^";
-let mut email_key = [0; 32];
-blake3::derive_key(EMAIL_CONTEXT, input_key, &mut email_key);
-let mut api_key = [0; 32];
-blake3::derive_key(API_CONTEXT, input_key, &mut api_key);
-assert!(email_key != api_key);
-```
-
-### The C implementation
-
-See [`c/README.md`](c/README.md).
-
-### Other implementations
-
-We post links to third-party bindings and implementations on the
-[@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever
-we hear about them. Some highlights include [an optimized Go
-implementation](https://github.com/zeebo/blake3), [Wasm bindings for
-Node.js and browsers](https://github.com/connor4312/blake3), and [binary
-wheels for Python](https://github.com/oconnor663/blake3-py).
-
-## Contributing
-
-Please see [CONTRIBUTING.md](CONTRIBUTING.md).
-
-## Intellectual property
-
-The Rust code is copyright Jack O'Connor, 2019-2020. The C code is
-copyright Samuel Neves and Jack O'Connor, 2019-2020. The assembly code
-is copyright Samuel Neves, 2019-2020.
-
-This work is released into the public domain with CC0 1.0.
-Alternatively, it is licensed under the Apache License 2.0.
-
-## Miscellany
-
-- [@veorq](https://github.com/veorq) and
- [@oconnor663](https://github.com/oconnor663) did [a podcast
- interview](https://www.cryptography.fm/3) about designing BLAKE3.
diff --git a/thirdparty/BLAKE3/b3sum/Cargo.toml b/thirdparty/BLAKE3/b3sum/Cargo.toml
deleted file mode 100644
index 4678bee2d..000000000
--- a/thirdparty/BLAKE3/b3sum/Cargo.toml
+++ /dev/null
@@ -1,27 +0,0 @@
-[package]
-name = "b3sum"
-version = "0.3.7"
-authors = ["Jack O'Connor <[email protected]>"]
-description = "a command line implementation of the BLAKE3 hash function"
-repository = "https://github.com/BLAKE3-team/BLAKE3"
-license = "CC0-1.0 OR Apache-2.0"
-readme = "README.md"
-edition = "2018"
-
-[features]
-neon = ["blake3/neon"]
-prefer_intrinsics = ["blake3/prefer_intrinsics"]
-pure = ["blake3/pure"]
-
-[dependencies]
-anyhow = "1.0.25"
-blake3 = { version = "0.3", path = "..", features = ["rayon"] }
-clap = "2.33.1"
-hex = "0.4.0"
-memmap = "0.7.0"
-rayon = "1.2.1"
-wild = "2.0.3"
-
-[dev-dependencies]
-duct = "0.13.3"
-tempfile = "3.1.0"
diff --git a/thirdparty/BLAKE3/b3sum/README.md b/thirdparty/BLAKE3/b3sum/README.md
deleted file mode 100644
index e97830b7c..000000000
--- a/thirdparty/BLAKE3/b3sum/README.md
+++ /dev/null
@@ -1,86 +0,0 @@
-# b3sum
-
-A command line utility for calculating
-[BLAKE3](https://github.com/BLAKE3-team/BLAKE3) hashes, similar to
-Coreutils tools like `b2sum` or `md5sum`.
-
-```
-b3sum 0.3.6
-
-USAGE:
- b3sum [FLAGS] [OPTIONS] [FILE]...
-
-FLAGS:
- -c, --check Reads BLAKE3 sums from the [file]s and checks them
- -h, --help Prints help information
- --keyed Uses the keyed mode. The secret key is read from standard
- input, and it must be exactly 32 raw bytes.
- --no-mmap Disables memory mapping. Currently this also disables
- multithreading.
- --no-names Omits filenames in the output
- --quiet Skips printing OK for each successfully verified file.
- Must be used with --check.
- --raw Writes raw output bytes to stdout, rather than hex.
- --no-names is implied. In this case, only a single
- input is allowed.
- -V, --version Prints version information
-
-OPTIONS:
- --derive-key <CONTEXT> Uses the key derivation mode, with the given
- context string. Cannot be used with --keyed.
- -l, --length <LEN> The number of output bytes, prior to hex
- encoding (default 32)
- --num-threads <NUM> The maximum number of threads to use. By
- default, this is the number of logical cores.
- If this flag is omitted, or if its value is 0,
- RAYON_NUM_THREADS is also respected.
-
-ARGS:
- <FILE>... Files to hash, or checkfiles to check. When no file is given,
- or when - is given, read standard input.
-```
-
-See also [this document about how the `--check` flag
-works](https://github.com/BLAKE3-team/BLAKE3/blob/master/b3sum/what_does_check_do.md).
-
-# Example
-
-Hash the file `foo.txt`:
-
-```bash
-b3sum foo.txt
-```
-
-Time hashing a gigabyte of data, to see how fast it is:
-
-```bash
-# Create a 1 GB file.
-head -c 1000000000 /dev/zero > /tmp/bigfile
-# Hash it with SHA-256.
-time openssl sha256 /tmp/bigfile
-# Hash it with BLAKE3.
-time b3sum /tmp/bigfile
-```
-
-
-# Installation
-
-Prebuilt binaries are available for Linux, Windows, and macOS (requiring
-the [unidentified developer
-workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac))
-on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases).
-If you've [installed Rust and
-Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html),
-you can also build `b3sum` yourself with:
-
-```
-cargo install b3sum
-```
-
-On Linux for example, Cargo will put the compiled binary in
-`~/.cargo/bin`. You might want to add that directory to your `$PATH`, or
-`rustup` might have done it for you when you installed Cargo.
-
-If you want to install directly from this directory, you can run `cargo
-install --path .`. Or you can just build with `cargo build --release`,
-which puts the binary at `./target/release/b3sum`.
diff --git a/thirdparty/BLAKE3/b3sum/src/main.rs b/thirdparty/BLAKE3/b3sum/src/main.rs
deleted file mode 100644
index b01e5de58..000000000
--- a/thirdparty/BLAKE3/b3sum/src/main.rs
+++ /dev/null
@@ -1,621 +0,0 @@
-use anyhow::{bail, ensure, Context, Result};
-use clap::{App, Arg};
-use std::cmp;
-use std::convert::TryInto;
-use std::fs::File;
-use std::io;
-use std::io::prelude::*;
-use std::path::{Path, PathBuf};
-
-#[cfg(test)]
-mod unit_tests;
-
-const NAME: &str = "b3sum";
-
-const FILE_ARG: &str = "FILE";
-const DERIVE_KEY_ARG: &str = "derive-key";
-const KEYED_ARG: &str = "keyed";
-const LENGTH_ARG: &str = "length";
-const NO_MMAP_ARG: &str = "no-mmap";
-const NO_NAMES_ARG: &str = "no-names";
-const NUM_THREADS_ARG: &str = "num-threads";
-const RAW_ARG: &str = "raw";
-const CHECK_ARG: &str = "check";
-const QUIET_ARG: &str = "quiet";
-
-struct Args {
- inner: clap::ArgMatches<'static>,
- file_args: Vec<PathBuf>,
- base_hasher: blake3::Hasher,
-}
-
-impl Args {
- fn parse() -> Result<Self> {
- let inner = App::new(NAME)
- .version(env!("CARGO_PKG_VERSION"))
- .arg(Arg::with_name(FILE_ARG).multiple(true).help(
- "Files to hash, or checkfiles to check. When no file is given,\n\
- or when - is given, read standard input.",
- ))
- .arg(
- Arg::with_name(LENGTH_ARG)
- .long(LENGTH_ARG)
- .short("l")
- .takes_value(true)
- .value_name("LEN")
- .help(
- "The number of output bytes, prior to hex\n\
- encoding (default 32)",
- ),
- )
- .arg(
- Arg::with_name(NUM_THREADS_ARG)
- .long(NUM_THREADS_ARG)
- .takes_value(true)
- .value_name("NUM")
- .help(
- "The maximum number of threads to use. By\n\
- default, this is the number of logical cores.\n\
- If this flag is omitted, or if its value is 0,\n\
- RAYON_NUM_THREADS is also respected.",
- ),
- )
- .arg(
- Arg::with_name(KEYED_ARG)
- .long(KEYED_ARG)
- .requires(FILE_ARG)
- .help(
- "Uses the keyed mode. The secret key is read from standard\n\
- input, and it must be exactly 32 raw bytes.",
- ),
- )
- .arg(
- Arg::with_name(DERIVE_KEY_ARG)
- .long(DERIVE_KEY_ARG)
- .conflicts_with(KEYED_ARG)
- .takes_value(true)
- .value_name("CONTEXT")
- .help(
- "Uses the key derivation mode, with the given\n\
- context string. Cannot be used with --keyed.",
- ),
- )
- .arg(Arg::with_name(NO_MMAP_ARG).long(NO_MMAP_ARG).help(
- "Disables memory mapping. Currently this also disables\n\
- multithreading.",
- ))
- .arg(
- Arg::with_name(NO_NAMES_ARG)
- .long(NO_NAMES_ARG)
- .help("Omits filenames in the output"),
- )
- .arg(Arg::with_name(RAW_ARG).long(RAW_ARG).help(
- "Writes raw output bytes to stdout, rather than hex.\n\
- --no-names is implied. In this case, only a single\n\
- input is allowed.",
- ))
- .arg(
- Arg::with_name(CHECK_ARG)
- .long(CHECK_ARG)
- .short("c")
- .conflicts_with(DERIVE_KEY_ARG)
- .conflicts_with(KEYED_ARG)
- .conflicts_with(LENGTH_ARG)
- .conflicts_with(RAW_ARG)
- .conflicts_with(NO_NAMES_ARG)
- .help("Reads BLAKE3 sums from the [file]s and checks them"),
- )
- .arg(
- Arg::with_name(QUIET_ARG)
- .long(QUIET_ARG)
- .requires(CHECK_ARG)
- .help(
- "Skips printing OK for each successfully verified file.\n\
- Must be used with --check.",
- ),
- )
- // wild::args_os() is equivalent to std::env::args_os() on Unix,
- // but on Windows it adds support for globbing.
- .get_matches_from(wild::args_os());
- let file_args = if let Some(iter) = inner.values_of_os(FILE_ARG) {
- iter.map(|s| s.into()).collect()
- } else {
- vec!["-".into()]
- };
- if inner.is_present(RAW_ARG) && file_args.len() > 1 {
- bail!("Only one filename can be provided when using --raw");
- }
- let base_hasher = if inner.is_present(KEYED_ARG) {
- // In keyed mode, since stdin is used for the key, we can't handle
- // `-` arguments. Input::open handles that case below.
- blake3::Hasher::new_keyed(&read_key_from_stdin()?)
- } else if let Some(context) = inner.value_of(DERIVE_KEY_ARG) {
- blake3::Hasher::new_derive_key(context)
- } else {
- blake3::Hasher::new()
- };
- Ok(Self {
- inner,
- file_args,
- base_hasher,
- })
- }
-
- fn num_threads(&self) -> Result<Option<usize>> {
- if let Some(num_threads_str) = self.inner.value_of(NUM_THREADS_ARG) {
- Ok(Some(
- num_threads_str
- .parse()
- .context("Failed to parse num threads.")?,
- ))
- } else {
- Ok(None)
- }
- }
-
- fn check(&self) -> bool {
- self.inner.is_present(CHECK_ARG)
- }
-
- fn raw(&self) -> bool {
- self.inner.is_present(RAW_ARG)
- }
-
- fn no_mmap(&self) -> bool {
- self.inner.is_present(NO_MMAP_ARG)
- }
-
- fn no_names(&self) -> bool {
- self.inner.is_present(NO_NAMES_ARG)
- }
-
- fn len(&self) -> Result<u64> {
- if let Some(length) = self.inner.value_of(LENGTH_ARG) {
- length.parse::<u64>().context("Failed to parse length.")
- } else {
- Ok(blake3::OUT_LEN as u64)
- }
- }
-
- fn keyed(&self) -> bool {
- self.inner.is_present(KEYED_ARG)
- }
-
- fn quiet(&self) -> bool {
- self.inner.is_present(QUIET_ARG)
- }
-}
-
-enum Input {
- Mmap(io::Cursor<memmap::Mmap>),
- File(File),
- Stdin,
-}
-
-impl Input {
- // Open an input file, using mmap if appropriate. "-" means stdin. Note
- // that this convention applies both to command line arguments, and to
- // filepaths that appear in a checkfile.
- fn open(path: &Path, args: &Args) -> Result<Self> {
- if path == Path::new("-") {
- if args.keyed() {
- bail!("Cannot open `-` in keyed mode");
- }
- return Ok(Self::Stdin);
- }
- let file = File::open(path)?;
- if !args.no_mmap() {
- if let Some(mmap) = maybe_memmap_file(&file)? {
- return Ok(Self::Mmap(io::Cursor::new(mmap)));
- }
- }
- Ok(Self::File(file))
- }
-
- fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> {
- let mut hasher = args.base_hasher.clone();
- match self {
- // The fast path: If we mmapped the file successfully, hash using
- // multiple threads. This doesn't work on stdin, or on some files,
- // and it can also be disabled with --no-mmap.
- Self::Mmap(cursor) => {
- hasher.update_with_join::<blake3::join::RayonJoin>(cursor.get_ref());
- }
- // The slower paths, for stdin or files we didn't/couldn't mmap.
- // This is currently all single-threaded. Doing multi-threaded
- // hashing without memory mapping is tricky, since all your worker
- // threads have to stop every time you refill the buffer, and that
- // ends up being a lot of overhead. To solve that, we need a more
- // complicated double-buffering strategy where a background thread
- // fills one buffer while the worker threads are hashing the other
- // one. We might implement that in the future, but since this is
- // the slow path anyway, it's not high priority.
- Self::File(file) => {
- copy_wide(file, &mut hasher)?;
- }
- Self::Stdin => {
- let stdin = io::stdin();
- let lock = stdin.lock();
- copy_wide(lock, &mut hasher)?;
- }
- }
- Ok(hasher.finalize_xof())
- }
-}
-
-impl Read for Input {
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
- match self {
- Self::Mmap(cursor) => cursor.read(buf),
- Self::File(file) => file.read(buf),
- Self::Stdin => io::stdin().read(buf),
- }
- }
-}
-
-// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
-// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
-// can support at least 64 KiB, and there's some performance benefit to using
-// bigger reads, so that's what we use here.
-fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
- let mut buffer = [0; 65536];
- let mut total = 0;
- loop {
- match reader.read(&mut buffer) {
- Ok(0) => return Ok(total),
- Ok(n) => {
- hasher.update(&buffer[..n]);
- total += n as u64;
- }
- Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
- Err(e) => return Err(e),
- }
- }
-}
-
-// Mmap a file, if it looks like a good idea. Return None in cases where we
-// know mmap will fail, or if the file is short enough that mmapping isn't
-// worth it. However, if we do try to mmap and it fails, return the error.
-fn maybe_memmap_file(file: &File) -> Result<Option<memmap::Mmap>> {
- let metadata = file.metadata()?;
- let file_size = metadata.len();
- Ok(if !metadata.is_file() {
- // Not a real file.
- None
- } else if file_size > isize::max_value() as u64 {
- // Too long to safely map.
- // https://github.com/danburkert/memmap-rs/issues/69
- None
- } else if file_size == 0 {
- // Mapping an empty file currently fails.
- // https://github.com/danburkert/memmap-rs/issues/72
- None
- } else if file_size < 16 * 1024 {
- // Mapping small files is not worth it.
- None
- } else {
- // Explicitly set the length of the memory map, so that filesystem
- // changes can't race to violate the invariants we just checked.
- let map = unsafe {
- memmap::MmapOptions::new()
- .len(file_size as usize)
- .map(&file)?
- };
- Some(map)
- })
-}
-
-fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
- // Encoding multiples of the block size is most efficient.
- let mut len = args.len()?;
- let mut block = [0; blake3::BLOCK_LEN];
- while len > 0 {
- output.fill(&mut block);
- let hex_str = hex::encode(&block[..]);
- let take_bytes = cmp::min(len, block.len() as u64);
- print!("{}", &hex_str[..2 * take_bytes as usize]);
- len -= take_bytes;
- }
- Ok(())
-}
-
-fn write_raw_output(output: blake3::OutputReader, args: &Args) -> Result<()> {
- let mut output = output.take(args.len()?);
- let stdout = std::io::stdout();
- let mut handler = stdout.lock();
- std::io::copy(&mut output, &mut handler)?;
-
- Ok(())
-}
-
-fn read_key_from_stdin() -> Result<[u8; blake3::KEY_LEN]> {
- let mut bytes = Vec::with_capacity(blake3::KEY_LEN + 1);
- let n = std::io::stdin()
- .lock()
- .take(blake3::KEY_LEN as u64 + 1)
- .read_to_end(&mut bytes)?;
- if n < 32 {
- bail!(
- "expected {} key bytes from stdin, found {}",
- blake3::KEY_LEN,
- n,
- )
- } else if n > 32 {
- bail!("read more than {} key bytes from stdin", blake3::KEY_LEN)
- } else {
- Ok(bytes[..blake3::KEY_LEN].try_into().unwrap())
- }
-}
-
-struct FilepathString {
- filepath_string: String,
- is_escaped: bool,
-}
-
-// Returns a FilepathString: the printable path plus an is_escaped flag.
-fn filepath_to_string(filepath: &Path) -> FilepathString {
- let unicode_cow = filepath.to_string_lossy();
- let mut filepath_string = unicode_cow.to_string();
- // If we're on Windows, normalize backslashes to forward slashes. This
- // avoids a lot of ugly escaping in the common case, and it makes
- // checkfiles created on Windows more likely to be portable to Unix. It
- // also allows us to set a blanket "no backslashes allowed in checkfiles on
- // Windows" rule, rather than allowing a Unix backslash to potentially get
- // interpreted as a directory separator on Windows.
- if cfg!(windows) {
- filepath_string = filepath_string.replace('\\', "/");
- }
- let mut is_escaped = false;
- if filepath_string.contains('\\') || filepath_string.contains('\n') {
- filepath_string = filepath_string.replace('\\', "\\\\").replace('\n', "\\n");
- is_escaped = true;
- }
- FilepathString {
- filepath_string,
- is_escaped,
- }
-}
-
-fn hex_half_byte(c: char) -> Result<u8> {
- // The hex characters in the hash must be lowercase for now, though we
- // could support uppercase too if we wanted to.
- if '0' <= c && c <= '9' {
- return Ok(c as u8 - '0' as u8);
- }
- if 'a' <= c && c <= 'f' {
- return Ok(c as u8 - 'a' as u8 + 10);
- }
- bail!("Invalid hex");
-}
-
-// The `check` command is a security tool. That means it's much better for a
-// check to fail more often than it should (a false negative), than for a check
-// to ever succeed when it shouldn't (a false positive). By forbidding certain
-// characters in checked filepaths, we avoid a class of false positives where
-// two different filepaths can get confused with each other.
-fn check_for_invalid_characters(utf8_path: &str) -> Result<()> {
- // Null characters in paths should never happen, but they can result in a
- // path getting silently truncated on Unix.
- if utf8_path.contains('\0') {
- bail!("Null character in path");
- }
- // Because we convert invalid UTF-8 sequences in paths to the Unicode
- // replacement character, multiple different invalid paths can map to the
- // same UTF-8 string.
- if utf8_path.contains('�') {
- bail!("Unicode replacement character in path");
- }
- // We normalize all Windows backslashes to forward slashes in our output,
- // so the only natural way to get a backslash in a checkfile on Windows is
- // to construct it on Unix and copy it over. (Or of course you could just
- // doctor it by hand.) To avoid confusing this with a directory separator,
- // we forbid backslashes entirely on Windows. Note that this check comes
- // after unescaping has been done.
- if cfg!(windows) && utf8_path.contains('\\') {
- bail!("Backslash in path");
- }
- Ok(())
-}
-
-fn unescape(mut path: &str) -> Result<String> {
- let mut unescaped = String::with_capacity(2 * path.len());
- while let Some(i) = path.find('\\') {
- ensure!(i < path.len() - 1, "Invalid backslash escape");
- unescaped.push_str(&path[..i]);
- match path[i + 1..].chars().next().unwrap() {
- // Anything other than a recognized escape sequence is an error.
- 'n' => unescaped.push_str("\n"),
- '\\' => unescaped.push_str("\\"),
- _ => bail!("Invalid backslash escape"),
- }
- path = &path[i + 2..];
- }
- unescaped.push_str(path);
- Ok(unescaped)
-}
-
-#[derive(Debug)]
-struct ParsedCheckLine {
- file_string: String,
- is_escaped: bool,
- file_path: PathBuf,
- expected_hash: blake3::Hash,
-}
-
-fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> {
- // Trim off the trailing newline, if any.
- line = line.trim_end_matches('\n');
- // If there's a backslash at the front of the line, that means we need to
- // unescape the path below. This matches the behavior of e.g. md5sum.
- let first = if let Some(c) = line.chars().next() {
- c
- } else {
- bail!("Empty line");
- };
- let mut is_escaped = false;
- if first == '\\' {
- is_escaped = true;
- line = &line[1..];
- }
- // The front of the line must be a hash of the usual length, followed by
- // two spaces. The hex characters in the hash must be lowercase for now,
- // though we could support uppercase too if we wanted to.
- let hash_hex_len = 2 * blake3::OUT_LEN;
- let num_spaces = 2;
- let prefix_len = hash_hex_len + num_spaces;
- ensure!(line.len() > prefix_len, "Short line");
- ensure!(
- line.chars().take(prefix_len).all(|c| c.is_ascii()),
- "Non-ASCII prefix"
- );
- ensure!(&line[hash_hex_len..][..2] == " ", "Invalid space");
- // Decode the hash hex.
- let mut hash_bytes = [0; blake3::OUT_LEN];
- let mut hex_chars = line[..hash_hex_len].chars();
- for byte in &mut hash_bytes {
- let high_char = hex_chars.next().unwrap();
- let low_char = hex_chars.next().unwrap();
- *byte = 16 * hex_half_byte(high_char)? + hex_half_byte(low_char)?;
- }
- let expected_hash: blake3::Hash = hash_bytes.into();
- let file_string = line[prefix_len..].to_string();
- let file_path_string = if is_escaped {
- // If we detected a backslash at the start of the line earlier, now we
- // need to unescape backslashes and newlines.
- unescape(&file_string)?
- } else {
- file_string.clone().into()
- };
- check_for_invalid_characters(&file_path_string)?;
- Ok(ParsedCheckLine {
- file_string,
- is_escaped,
- file_path: file_path_string.into(),
- expected_hash,
- })
-}
-
-fn hash_one_input(path: &Path, args: &Args) -> Result<()> {
- let mut input = Input::open(path, args)?;
- let output = input.hash(args)?;
- if args.raw() {
- write_raw_output(output, args)?;
- return Ok(());
- }
- if args.no_names() {
- write_hex_output(output, args)?;
- println!();
- return Ok(());
- }
- let FilepathString {
- filepath_string,
- is_escaped,
- } = filepath_to_string(path);
- if is_escaped {
- print!("\\");
- }
- write_hex_output(output, args)?;
- println!(" {}", filepath_string);
- Ok(())
-}
-
-// Returns true for success. Having a boolean return value here, instead of
-// passing down the some_file_failed reference, makes it less likely that we
-// might forget to set it in some error condition.
-fn check_one_line(line: &str, args: &Args) -> bool {
- let parse_result = parse_check_line(&line);
- let ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = match parse_result {
- Ok(parsed) => parsed,
- Err(e) => {
- eprintln!("{}: {}", NAME, e);
- return false;
- }
- };
- let file_string = if is_escaped {
- "\\".to_string() + &file_string
- } else {
- file_string
- };
- let hash_result: Result<blake3::Hash> = Input::open(&file_path, args)
- .and_then(|mut input| input.hash(args))
- .map(|mut hash_output| {
- let mut found_hash_bytes = [0; blake3::OUT_LEN];
- hash_output.fill(&mut found_hash_bytes);
- found_hash_bytes.into()
- });
- let found_hash: blake3::Hash = match hash_result {
- Ok(hash) => hash,
- Err(e) => {
- println!("{}: FAILED ({})", file_string, e);
- return false;
- }
- };
- // This is a constant-time comparison.
- if expected_hash == found_hash {
- if !args.quiet() {
- println!("{}: OK", file_string);
- }
- true
- } else {
- println!("{}: FAILED", file_string);
- false
- }
-}
-
-fn check_one_checkfile(path: &Path, args: &Args, some_file_failed: &mut bool) -> Result<()> {
- let checkfile_input = Input::open(path, args)?;
- let mut bufreader = io::BufReader::new(checkfile_input);
- let mut line = String::new();
- loop {
- line.clear();
- let n = bufreader.read_line(&mut line)?;
- if n == 0 {
- return Ok(());
- }
- // check_one_line() prints errors and turns them into a success=false
- // return, so it doesn't return a Result.
- let success = check_one_line(&line, args);
- if !success {
- *some_file_failed = true;
- }
- }
-}
-
-fn main() -> Result<()> {
- let args = Args::parse()?;
- let mut thread_pool_builder = rayon::ThreadPoolBuilder::new();
- if let Some(num_threads) = args.num_threads()? {
- thread_pool_builder = thread_pool_builder.num_threads(num_threads);
- }
- let thread_pool = thread_pool_builder.build()?;
- thread_pool.install(|| {
- let mut some_file_failed = false;
- // Note that file_args automatically includes `-` if nothing is given.
- for path in &args.file_args {
- if args.check() {
- // A hash mismatch or a failure to read a hashed file will be
- // printed in the checkfile loop, and will not propagate here.
- // This is similar to the explicit error handling we do in the
- // hashing case immediately below. In these cases,
- // some_file_failed will be set to true.
- check_one_checkfile(path, &args, &mut some_file_failed)?;
- } else {
- // Errors encountered in hashing are tolerated and printed to
- // stderr. This allows e.g. `b3sum *` to print errors for
- // non-files and keep going. However, if we encounter any
- // errors we'll still return non-zero at the end.
- let result = hash_one_input(path, &args);
- if let Err(e) = result {
- some_file_failed = true;
- eprintln!("{}: {}: {}", NAME, path.to_string_lossy(), e);
- }
- }
- }
- std::process::exit(if some_file_failed { 1 } else { 0 });
- })
-}
diff --git a/thirdparty/BLAKE3/b3sum/src/unit_tests.rs b/thirdparty/BLAKE3/b3sum/src/unit_tests.rs
deleted file mode 100644
index 1fa1a17dc..000000000
--- a/thirdparty/BLAKE3/b3sum/src/unit_tests.rs
+++ /dev/null
@@ -1,189 +0,0 @@
-use std::path::Path;
-
-#[test]
-fn test_parse_check_line() {
- // =========================
- // ===== Success Cases =====
- // =========================
-
- // the basic case
- let crate::ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = crate::parse_check_line(
- "0909090909090909090909090909090909090909090909090909090909090909 foo",
- )
- .unwrap();
- assert_eq!(expected_hash, blake3::Hash::from([0x09; 32]));
- assert!(!is_escaped);
- assert_eq!(file_string, "foo");
- assert_eq!(file_path, Path::new("foo"));
-
- // regular whitespace
- let crate::ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = crate::parse_check_line(
- "fafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafa fo \to\n\n\n",
- )
- .unwrap();
- assert_eq!(expected_hash, blake3::Hash::from([0xfa; 32]));
- assert!(!is_escaped);
- assert_eq!(file_string, "fo \to");
- assert_eq!(file_path, Path::new("fo \to"));
-
- // path is one space
- let crate::ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = crate::parse_check_line(
- "4242424242424242424242424242424242424242424242424242424242424242 ",
- )
- .unwrap();
- assert_eq!(expected_hash, blake3::Hash::from([0x42; 32]));
- assert!(!is_escaped);
- assert_eq!(file_string, " ");
- assert_eq!(file_path, Path::new(" "));
-
- // *Unescaped* backslashes. Note that this line does *not* start with a
- // backslash, so something like "\" + "n" is interpreted as *two*
- // characters. We forbid all backslashes on Windows, so this test is
- // Unix-only.
- if cfg!(not(windows)) {
- let crate::ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = crate::parse_check_line(
- "4343434343434343434343434343434343434343434343434343434343434343 fo\\a\\no",
- )
- .unwrap();
- assert_eq!(expected_hash, blake3::Hash::from([0x43; 32]));
- assert!(!is_escaped);
- assert_eq!(file_string, "fo\\a\\no");
- assert_eq!(file_path, Path::new("fo\\a\\no"));
- }
-
- // escaped newline
- let crate::ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = crate::parse_check_line(
- "\\4444444444444444444444444444444444444444444444444444444444444444 fo\\n\\no",
- )
- .unwrap();
- assert_eq!(expected_hash, blake3::Hash::from([0x44; 32]));
- assert!(is_escaped);
- assert_eq!(file_string, "fo\\n\\no");
- assert_eq!(file_path, Path::new("fo\n\no"));
-
- // Escaped newline and backslash. Again because backslash is not allowed on
- // Windows, this test is Unix-only.
- if cfg!(not(windows)) {
- let crate::ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = crate::parse_check_line(
- "\\4545454545454545454545454545454545454545454545454545454545454545 fo\\n\\\\o",
- )
- .unwrap();
- assert_eq!(expected_hash, blake3::Hash::from([0x45; 32]));
- assert!(is_escaped);
- assert_eq!(file_string, "fo\\n\\\\o");
- assert_eq!(file_path, Path::new("fo\n\\o"));
- }
-
- // non-ASCII path
- let crate::ParsedCheckLine {
- file_string,
- is_escaped,
- file_path,
- expected_hash,
- } = crate::parse_check_line(
- "4646464646464646464646464646464646464646464646464646464646464646 否认",
- )
- .unwrap();
- assert_eq!(expected_hash, blake3::Hash::from([0x46; 32]));
- assert!(!is_escaped);
- assert_eq!(file_string, "否认");
- assert_eq!(file_path, Path::new("否认"));
-
- // =========================
- // ===== Failure Cases =====
- // =========================
-
- // too short
- crate::parse_check_line("").unwrap_err();
- crate::parse_check_line("0").unwrap_err();
- crate::parse_check_line("00").unwrap_err();
- crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000")
- .unwrap_err();
- crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 ")
- .unwrap_err();
-
- // not enough spaces
- crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 foo")
- .unwrap_err();
-
- // capital letter hex
- crate::parse_check_line(
- "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA foo",
- )
- .unwrap_err();
-
- // non-hex hex
- crate::parse_check_line(
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx foo",
- )
- .unwrap_err();
-
- // non-ASCII hex
- crate::parse_check_line("你好, 我叫杰克. 认识你很高兴. 要不要吃个香蕉? foo").unwrap_err();
-
- // invalid escape sequence
- crate::parse_check_line(
- "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\o",
- )
- .unwrap_err();
-
- // truncated escape sequence
- crate::parse_check_line(
- "\\0000000000000000000000000000000000000000000000000000000000000000 foo\\",
- )
- .unwrap_err();
-
- // null char
- crate::parse_check_line(
- "0000000000000000000000000000000000000000000000000000000000000000 fo\0o",
- )
- .unwrap_err();
-
- // Unicode replacement char
- crate::parse_check_line(
- "0000000000000000000000000000000000000000000000000000000000000000 fo�o",
- )
- .unwrap_err();
-
- // On Windows only, backslashes are not allowed, escaped or otherwise.
- if cfg!(windows) {
- crate::parse_check_line(
- "0000000000000000000000000000000000000000000000000000000000000000 fo\\o",
- )
- .unwrap_err();
- crate::parse_check_line(
- "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\\\o",
- )
- .unwrap_err();
- }
-}
diff --git a/thirdparty/BLAKE3/b3sum/tests/cli_tests.rs b/thirdparty/BLAKE3/b3sum/tests/cli_tests.rs
deleted file mode 100644
index 51fbbba98..000000000
--- a/thirdparty/BLAKE3/b3sum/tests/cli_tests.rs
+++ /dev/null
@@ -1,552 +0,0 @@
-use duct::cmd;
-use std::ffi::OsString;
-use std::fs;
-use std::io::prelude::*;
-use std::path::PathBuf;
-
-pub fn b3sum_exe() -> PathBuf {
- env!("CARGO_BIN_EXE_b3sum").into()
-}
-
-#[test]
-fn test_hash_one() {
- let expected = format!("{} -", blake3::hash(b"foo").to_hex());
- let output = cmd!(b3sum_exe()).stdin_bytes("foo").read().unwrap();
- assert_eq!(&*expected, output);
-}
-
-#[test]
-fn test_hash_one_raw() {
- let expected = blake3::hash(b"foo").as_bytes().to_owned();
- let output = cmd!(b3sum_exe(), "--raw")
- .stdin_bytes("foo")
- .stdout_capture()
- .run()
- .unwrap()
- .stdout;
- assert_eq!(expected, output.as_slice());
-}
-
-#[test]
-fn test_hash_many() {
- let dir = tempfile::tempdir().unwrap();
- let file1 = dir.path().join("file1");
- fs::write(&file1, b"foo").unwrap();
- let file2 = dir.path().join("file2");
- fs::write(&file2, b"bar").unwrap();
-
- let output = cmd!(b3sum_exe(), &file1, &file2).read().unwrap();
- let foo_hash = blake3::hash(b"foo");
- let bar_hash = blake3::hash(b"bar");
- let expected = format!(
- "{} {}\n{} {}",
- foo_hash.to_hex(),
- // account for slash normalization on Windows
- file1.to_string_lossy().replace("\\", "/"),
- bar_hash.to_hex(),
- file2.to_string_lossy().replace("\\", "/"),
- );
- assert_eq!(expected, output);
-
- let output_no_names = cmd!(b3sum_exe(), "--no-names", &file1, &file2)
- .read()
- .unwrap();
- let expected_no_names = format!("{}\n{}", foo_hash.to_hex(), bar_hash.to_hex(),);
- assert_eq!(expected_no_names, output_no_names);
-}
-
-#[test]
-fn test_missing_files() {
- let dir = tempfile::tempdir().unwrap();
- let file1 = dir.path().join("file1");
- fs::write(&file1, b"foo").unwrap();
- let file2 = dir.path().join("file2");
- fs::write(&file2, b"bar").unwrap();
-
- let output = cmd!(b3sum_exe(), "file1", "missing_file", "file2")
- .dir(dir.path())
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- assert!(!output.status.success());
-
- let foo_hash = blake3::hash(b"foo");
- let bar_hash = blake3::hash(b"bar");
- let expected_stdout = format!(
- "{} file1\n{} file2\n",
- foo_hash.to_hex(),
- bar_hash.to_hex(),
- );
- assert_eq!(expected_stdout.as_bytes(), &output.stdout[..]);
-
- let bing_error = fs::File::open(dir.path().join("missing_file")).unwrap_err();
- let expected_stderr = format!("b3sum: missing_file: {}\n", bing_error.to_string());
- assert_eq!(expected_stderr.as_bytes(), &output.stderr[..]);
-}
-
-#[test]
-fn test_hash_length() {
- let mut buf = [0; 100];
- blake3::Hasher::new()
- .update(b"foo")
- .finalize_xof()
- .fill(&mut buf);
- let expected = format!("{} -", hex::encode(&buf[..]));
- let output = cmd!(b3sum_exe(), "--length=100")
- .stdin_bytes("foo")
- .read()
- .unwrap();
- assert_eq!(&*expected, &*output);
-}
-
-#[test]
-fn test_keyed() {
- let key = [42; blake3::KEY_LEN];
- let f = tempfile::NamedTempFile::new().unwrap();
- f.as_file().write_all(b"foo").unwrap();
- f.as_file().flush().unwrap();
- let expected = blake3::keyed_hash(&key, b"foo").to_hex();
- let output = cmd!(b3sum_exe(), "--keyed", "--no-names", f.path())
- .stdin_bytes(&key[..])
- .read()
- .unwrap();
- assert_eq!(&*expected, &*output);
-}
-
-#[test]
-fn test_derive_key() {
- let context = "BLAKE3 2019-12-28 10:28:41 example context";
- let f = tempfile::NamedTempFile::new().unwrap();
- f.as_file().write_all(b"key material").unwrap();
- f.as_file().flush().unwrap();
- let mut derive_key_out = [0; blake3::OUT_LEN];
- blake3::derive_key(context, b"key material", &mut derive_key_out);
- let expected = hex::encode(&derive_key_out);
- let output = cmd!(b3sum_exe(), "--derive-key", context, "--no-names", f.path())
- .read()
- .unwrap();
- assert_eq!(&*expected, &*output);
-}
-
-#[test]
-fn test_no_mmap() {
- let f = tempfile::NamedTempFile::new().unwrap();
- f.as_file().write_all(b"foo").unwrap();
- f.as_file().flush().unwrap();
-
- let expected = blake3::hash(b"foo").to_hex();
- let output = cmd!(b3sum_exe(), "--no-mmap", "--no-names", f.path())
- .read()
- .unwrap();
- assert_eq!(&*expected, &*output);
-}
-
-#[test]
-fn test_length_without_value_is_an_error() {
- let result = cmd!(b3sum_exe(), "--length")
- .stdin_bytes("foo")
- .stderr_capture()
- .run();
- assert!(result.is_err());
-}
-
-#[test]
-fn test_raw_with_multi_files_is_an_error() {
- let f1 = tempfile::NamedTempFile::new().unwrap();
- let f2 = tempfile::NamedTempFile::new().unwrap();
-
- // Make sure it doesn't error with just one file
- let result = cmd!(b3sum_exe(), "--raw", f1.path()).stdout_capture().run();
- assert!(result.is_ok());
-
- // Make sure it errors when both files are passed
- let result = cmd!(b3sum_exe(), "--raw", f1.path(), f2.path())
- .stderr_capture()
- .run();
- assert!(result.is_err());
-}
-
-#[test]
-#[cfg(unix)]
-fn test_newline_and_backslash_escaping_on_unix() {
- let empty_hash = blake3::hash(b"").to_hex();
- let dir = tempfile::tempdir().unwrap();
- fs::create_dir(dir.path().join("subdir")).unwrap();
- let names = [
- "abcdef",
- "abc\ndef",
- "abc\\def",
- "abc\rdef",
- "abc\r\ndef",
- "subdir/foo",
- ];
- let mut paths = Vec::new();
- for name in &names {
- let path = dir.path().join(name);
- println!("creating file at {:?}", path);
- fs::write(&path, b"").unwrap();
- paths.push(path);
- }
- let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap();
- let expected = format!(
- "\
-{0} abcdef
-\\{0} abc\\ndef
-\\{0} abc\\\\def
-{0} abc\rdef
-\\{0} abc\r\\ndef
-{0} subdir/foo",
- empty_hash,
- );
- println!("output");
- println!("======");
- println!("{}", output);
- println!();
- println!("expected");
- println!("========");
- println!("{}", expected);
- println!();
- assert_eq!(expected, output);
-}
-
-#[test]
-#[cfg(windows)]
-fn test_slash_normalization_on_windows() {
- let empty_hash = blake3::hash(b"").to_hex();
- let dir = tempfile::tempdir().unwrap();
- fs::create_dir(dir.path().join("subdir")).unwrap();
- // Note that filenames can't contain newlines or backslashes on Windows, so
- // we don't test escaping here. We only test forward slash and backslash as
- // directory separators.
- let names = ["abcdef", "subdir/foo", "subdir\\bar"];
- let mut paths = Vec::new();
- for name in &names {
- let path = dir.path().join(name);
- println!("creating file at {:?}", path);
- fs::write(&path, b"").unwrap();
- paths.push(path);
- }
- let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap();
- let expected = format!(
- "\
-{0} abcdef
-{0} subdir/foo
-{0} subdir/bar",
- empty_hash,
- );
- println!("output");
- println!("======");
- println!("{}", output);
- println!();
- println!("expected");
- println!("========");
- println!("{}", expected);
- println!();
- assert_eq!(expected, output);
-}
-
-#[test]
-#[cfg(unix)]
-fn test_invalid_unicode_on_unix() {
- use std::os::unix::ffi::OsStringExt;
-
- let empty_hash = blake3::hash(b"").to_hex();
- let dir = tempfile::tempdir().unwrap();
- let names = ["abcdef".into(), OsString::from_vec(b"abc\xffdef".to_vec())];
- let mut paths = Vec::new();
- for name in &names {
- let path = dir.path().join(name);
- println!("creating file at {:?}", path);
- // Note: Some operating systems, macOS in particular, simply don't
- // allow invalid Unicode in filenames. On those systems, this write
- // will fail. That's fine, we'll just short-circuit this test in that
- // case. But assert that at least Linux allows this.
- let write_result = fs::write(&path, b"");
- if cfg!(target_os = "linux") {
- write_result.expect("Linux should allow invalid Unicode");
- } else if write_result.is_err() {
- return;
- }
- paths.push(path);
- }
- let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap();
- let expected = format!(
- "\
-{0} abcdef
-{0} abc�def",
- empty_hash,
- );
- println!("output");
- println!("======");
- println!("{}", output);
- println!();
- println!("expected");
- println!("========");
- println!("{}", expected);
- println!();
- assert_eq!(expected, output);
-}
-
-#[test]
-#[cfg(windows)]
-fn test_invalid_unicode_on_windows() {
- use std::os::windows::ffi::OsStringExt;
-
- let empty_hash = blake3::hash(b"").to_hex();
- let dir = tempfile::tempdir().unwrap();
- let surrogate_char = 0xDC00;
- let bad_unicode_wchars = [
- 'a' as u16,
- 'b' as u16,
- 'c' as u16,
- surrogate_char,
- 'd' as u16,
- 'e' as u16,
- 'f' as u16,
- ];
- let bad_osstring = OsString::from_wide(&bad_unicode_wchars);
- let names = ["abcdef".into(), bad_osstring];
- let mut paths = Vec::new();
- for name in &names {
- let path = dir.path().join(name);
- println!("creating file at {:?}", path);
- fs::write(&path, b"").unwrap();
- paths.push(path);
- }
- let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap();
- let expected = format!(
- "\
-{0} abcdef
-{0} abc�def",
- empty_hash,
- );
- println!("output");
- println!("======");
- println!("{}", output);
- println!();
- println!("expected");
- println!("========");
- println!("{}", expected);
- println!();
- assert_eq!(expected, output);
-}
-
-#[test]
-fn test_check() {
- // Make a directory full of files, and make sure the b3sum output in that
- // directory is what we expect.
- let a_hash = blake3::hash(b"a").to_hex();
- let b_hash = blake3::hash(b"b").to_hex();
- let cd_hash = blake3::hash(b"cd").to_hex();
- let dir = tempfile::tempdir().unwrap();
- fs::write(dir.path().join("a"), b"a").unwrap();
- fs::write(dir.path().join("b"), b"b").unwrap();
- fs::create_dir(dir.path().join("c")).unwrap();
- fs::write(dir.path().join("c/d"), b"cd").unwrap();
- let output = cmd!(b3sum_exe(), "a", "b", "c/d")
- .dir(dir.path())
- .stdout_capture()
- .stderr_capture()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- let expected_checkfile = format!(
- "{} a\n\
- {} b\n\
- {} c/d\n",
- a_hash, b_hash, cd_hash,
- );
- assert_eq!(expected_checkfile, stdout);
- assert_eq!("", stderr);
-
- // Now use the output we just validated as a checkfile, passed to stdin.
- let output = cmd!(b3sum_exe(), "--check")
- .stdin_bytes(expected_checkfile.as_bytes())
- .dir(dir.path())
- .stdout_capture()
- .stderr_capture()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- let expected_check_output = "\
- a: OK\n\
- b: OK\n\
- c/d: OK\n";
- assert_eq!(expected_check_output, stdout);
- assert_eq!("", stderr);
-
- // Now pass the same checkfile twice on the command line just for fun.
- let checkfile_path = dir.path().join("checkfile");
- fs::write(&checkfile_path, &expected_checkfile).unwrap();
- let output = cmd!(b3sum_exe(), "--check", &checkfile_path, &checkfile_path)
- .dir(dir.path())
- .stdout_capture()
- .stderr_capture()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- let mut double_check_output = String::new();
- double_check_output.push_str(&expected_check_output);
- double_check_output.push_str(&expected_check_output);
- assert_eq!(double_check_output, stdout);
- assert_eq!("", stderr);
-
- // Corrupt one of the files and check again.
- fs::write(dir.path().join("b"), b"CORRUPTION").unwrap();
- let output = cmd!(b3sum_exe(), "--check", &checkfile_path)
- .dir(dir.path())
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- let expected_check_failure = "\
- a: OK\n\
- b: FAILED\n\
- c/d: OK\n";
- assert!(!output.status.success());
- assert_eq!(expected_check_failure, stdout);
- assert_eq!("", stderr);
-
- // Delete one of the files and check again.
- fs::remove_file(dir.path().join("b")).unwrap();
- let open_file_error = fs::File::open(dir.path().join("b")).unwrap_err();
- let output = cmd!(b3sum_exe(), "--check", &checkfile_path)
- .dir(dir.path())
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- let expected_check_failure = format!(
- "a: OK\n\
- b: FAILED ({})\n\
- c/d: OK\n",
- open_file_error,
- );
- assert!(!output.status.success());
- assert_eq!(expected_check_failure, stdout);
- assert_eq!("", stderr);
-
- // Confirm that --quiet suppresses the OKs but not the FAILEDs.
- let output = cmd!(b3sum_exe(), "--check", "--quiet", &checkfile_path)
- .dir(dir.path())
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- let expected_check_failure = format!("b: FAILED ({})\n", open_file_error);
- assert!(!output.status.success());
- assert_eq!(expected_check_failure, stdout);
- assert_eq!("", stderr);
-}
-
-#[test]
-fn test_check_invalid_characters() {
- // Check that a null character in the path fails.
- let output = cmd!(b3sum_exe(), "--check")
- .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \0")
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- assert!(!output.status.success());
- assert_eq!("", stdout);
- assert_eq!("b3sum: Null character in path\n", stderr);
-
- // Check that a Unicode replacement character in the path fails.
- let output = cmd!(b3sum_exe(), "--check")
- .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 �")
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- assert!(!output.status.success());
- assert_eq!("", stdout);
- assert_eq!("b3sum: Unicode replacement character in path\n", stderr);
-
- // Check that an invalid escape sequence in the path fails.
- let output = cmd!(b3sum_exe(), "--check")
- .stdin_bytes("\\0000000000000000000000000000000000000000000000000000000000000000 \\a")
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- assert!(!output.status.success());
- assert_eq!("", stdout);
- assert_eq!("b3sum: Invalid backslash escape\n", stderr);
-
- // Windows also forbids literal backslashes. Check for that if and only if
- // we're on Windows.
- if cfg!(windows) {
- let output = cmd!(b3sum_exe(), "--check")
- .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \\")
- .stdout_capture()
- .stderr_capture()
- .unchecked()
- .run()
- .unwrap();
- let stdout = std::str::from_utf8(&output.stdout).unwrap();
- let stderr = std::str::from_utf8(&output.stderr).unwrap();
- assert!(!output.status.success());
- assert_eq!("", stdout);
- assert_eq!("b3sum: Backslash in path\n", stderr);
- }
-}
-
-#[test]
-fn test_globbing() {
- // On Unix, globbing is provided by the shell. On Windows, globbing is
- // provided by us, using the `wild` crate.
- let dir = tempfile::tempdir().unwrap();
- let file1 = dir.path().join("file1");
- fs::write(&file1, b"foo").unwrap();
- let file2 = dir.path().join("file2");
- fs::write(&file2, b"bar").unwrap();
-
- let foo_hash = blake3::hash(b"foo");
- let bar_hash = blake3::hash(b"bar");
- // NOTE: This assumes that the glob will be expanded in alphabetical order,
- // to "file1 file2" rather than "file2 file1". So far, this seems to
- // be true (guaranteed?) of Unix shell behavior, and true in practice
- // with the `wild` crate on Windows. It's possible that this could
- // start failing in the future, though, or on some unknown platform.
- // If that ever happens, we'll need to relax this test somehow,
- // probably by just testing for both possible outputs. I'm not
- // handling that case in advance, though, because I'd prefer to hear
- // about it if it comes up.
- let expected = format!("{} file1\n{} file2", foo_hash.to_hex(), bar_hash.to_hex());
-
- let star_command = format!("{} *", b3sum_exe().to_str().unwrap());
- let (exe, c_flag) = if cfg!(windows) {
- ("cmd.exe", "/C")
- } else {
- ("/bin/sh", "-c")
- };
- let output = cmd!(exe, c_flag, star_command)
- .dir(dir.path())
- .read()
- .unwrap();
- assert_eq!(expected, output);
-}
diff --git a/thirdparty/BLAKE3/b3sum/what_does_check_do.md b/thirdparty/BLAKE3/b3sum/what_does_check_do.md
deleted file mode 100644
index 3a44a0010..000000000
--- a/thirdparty/BLAKE3/b3sum/what_does_check_do.md
+++ /dev/null
@@ -1,174 +0,0 @@
-# How does `b3sum --check` behave exactly?<br>or: Are filepaths...text?
-
-Most of the time, `b3sum --check` is a drop-in replacement for `md5sum --check`
-and other Coreutils hashing tools. It consumes a checkfile (the output of a
-regular `b3sum` command), re-hashes all the files listed there, and returns
-success if all of those hashes are still correct. What makes this more
-complicated than it might seem is that representing filepaths as text means we
-need to consider many possible edge cases of unrepresentable filepaths. This
-document describes all of these edge cases in detail.
-
-## The simple case
-
-Here's the result of running `b3sum a b c/d` in a directory that contains
-those three files:
-
-```bash
-$ echo hi > a
-$ echo lo > b
-$ mkdir c
-$ echo stuff > c/d
-$ b3sum a b c/d
-0b8b60248fad7ac6dfac221b7e01a8b91c772421a15b387dd1fb2d6a94aee438 a
-6ae4a57bbba24f79c461d30bcb4db973b9427d9207877e34d2d74528daa84115 b
-2d477356c962e54784f1c5dc5297718d92087006f6ee96b08aeaf7f3cd252377 c/d
-```
-
-If we pipe that output into `b3sum --check`, it will exit with status zero
-(success) and print:
-
-```bash
-$ b3sum a b c/d | b3sum --check
-a: OK
-b: OK
-c/d: OK
-```
-
-If we delete `b` and change the contents of `c/d`, and then use the same
-checkfile as above, `b3sum --check` will exit with a non-zero status (failure)
-and print:
-
-```bash
-$ b3sum a b c/d > checkfile
-$ rm b
-$ echo more stuff >> c/d
-$ b3sum --check checkfile
-a: OK
-b: FAILED (No such file or directory (os error 2))
-c/d: FAILED
-```
-
-In these typical cases, `b3sum` and `md5sum` have identical output for success
-and very similar output for failure.
-
-## Escaping newlines and backslashes
-
-Since the checkfile format (the regular output format of `b3sum`) is
-newline-separated text, we need to worry about what happens when a filepath
-contains a newline, or worse. Suppose we create a file named `x[newline]x`
-(3 characters). One way to create such a file is with a Python one-liner like
-this:
-
-```python
->>> open("x\nx", "w")
-```
-
-Here's what happens when we hash that file with `b3sum`:
-
-```bash
-$ b3sum x*
-\af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 x\nx
-```
-
-Notice two things. First, `b3sum` puts a single `\` character at the front of
-the line. This indicates that the filepath contains escape sequences that
-`b3sum --check` will need to unescape. Then, `b3sum` replaces the newline
-character in the filepath with the two-character escape sequence `\n`.
-Similarly, if the filepath contained a backslash, `b3sum` would escape it as
-`\\` in the output. So far, all of this behavior is still identical to
-`md5sum`.
-
-## Invalid Unicode
-
-This is where `b3sum` and `md5sum` diverge. Apart from the newline and backslash
-escapes described above, `md5sum` copies all other filepath bytes verbatim to
-its output. That means its output encoding is "ASCII plus whatever bytes we got
-from the command line". This creates two problems:
-
-1. Printing something that isn't UTF-8 is kind of gross.
-2. Windows support.
-
-What's the deal with Windows? To start with, there's a fundamental difference
-in how Unix and Windows represent filepaths. Unix filepaths are "usually UTF-8"
-and Windows filepaths are "usually UTF-16". That means that a file named `abc`
-is typically represented as the bytes `[97, 98, 99]` on Unix and as the bytes
-`[97, 0, 98, 0, 99, 0]` on Windows. The `md5sum` approach won't work if we plan
-on creating a checkfile on Unix and checking it on Windows, or vice versa.
-
-A more portable approach is to convert platform-specific bytes into some
-consistent Unicode encoding. (In practice this is going to be UTF-8, but in
-theory it could be anything.) Then when `--check` needs to open a file, we
-convert the Unicode representation back into platform-specific bytes. This
-makes important common cases like `abc`, and in fact even `abc[newline]def`,
-work as expected. Great!
-
-But...what did we mean above when we said *usually* UTF-8 and *usually* UTF-16?
-It turns out that not every possible sequence of bytes is valid UTF-8, and not
-every possible sequence of 16-bit wide chars is valid UTF-16. For example, the
-byte 0xFF (255) can never appear in any UTF-8 string. If we ask Python to
-decode it, it yells at us:
-
-```python
->>> b"\xFF".decode("UTF-8")
-UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
-```
-
-However, tragically, we *can* create a file with that byte in its name (on
-Linux at least, though not usually on macOS):
-
-```python
->>> open(b"y\xFFy", "w")
-```
-
-So some filepaths aren't representable in Unicode at all. Our plan to "convert
-platform-specific bytes into some consistent Unicode encoding" isn't going to
-work for everything. What does `b3sum` do with the file above?
-
-```bash
-$ b3sum y*
-af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 y�y
-```
-
-That � in there is a "Unicode replacement character". When we run into
-filepaths that we can't represent in Unicode, we replace the unrepresentable
-parts with these characters. On the checking side, to avoid any possible
-confusion between two different invalid filepaths, we automatically fail if we
-see a replacement character. Together with a few more details covered in the
-next section, this gives us an important set of properties:
-
-1. Any file can be hashed locally.
-2. Any file with a valid Unicode name not containing the � character can be
- checked.
-3. Checking ambiguous or unrepresentable filepaths always fails.
-4. Checkfiles are always valid UTF-8.
-5. Checkfiles are portable between Unix and Windows.
-
-## Formal Rules
-
-1. When hashing, filepaths are represented in a platform-specific encoding,
- which can accommodate any filepath on the current platform. In Rust, this is
- `OsStr`/`OsString`.
-2. In output, filepaths are first converted to UTF-8. Any non-Unicode segments
- are replaced with Unicode replacement characters (U+FFFD). In Rust, this is
- `OsStr::to_string_lossy`.
-3. Then, if a filepath contains any backslashes (U+005C) or newlines (U+000A),
- these characters are escaped as `\\` and `\n` respectively.
-4. Finally, any output line containing an escape sequence is prefixed with a
- single backslash.
-5. When checking, each line is parsed as UTF-8, separated by a newline
- (U+000A). Invalid UTF-8 is an error.
-6. Then, if a line begins with a backslash, the filepath component is
- unescaped. Any escape sequence other than `\\` or `\n` is an error. If a
- line does not begin with a backslash, unescaping is not performed, and any
- backslashes in the filepath component are interpreted literally. (`b3sum`
- output never contains unescaped backslashes, but they can occur in
- checkfiles assembled by hand.)
-7. Finally, if a filepath contains a Unicode replacement character (U+FFFD) or
- a null character (U+0000), it is an error.
-
- **Additionally, on Windows only:**
-
-8. In output, all backslashes (U+005C) are replaced with forward slashes
- (U+002F).
-9. When checking, after unescaping, if a filepath contains a backslash, it is
- an error.
diff --git a/thirdparty/BLAKE3/benches/bench.rs b/thirdparty/BLAKE3/benches/bench.rs
deleted file mode 100644
index ba5a4041f..000000000
--- a/thirdparty/BLAKE3/benches/bench.rs
+++ /dev/null
@@ -1,520 +0,0 @@
-#![feature(test)]
-
-extern crate test;
-
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
-use blake3::platform::{Platform, MAX_SIMD_DEGREE};
-use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN};
-use rand::prelude::*;
-use test::Bencher;
-
-const KIB: usize = 1024;
-
-// This struct randomizes two things:
-// 1. The actual bytes of input.
-// 2. The page offset the input starts at.
-pub struct RandomInput {
- buf: Vec<u8>,
- len: usize,
- offsets: Vec<usize>,
- offset_index: usize,
-}
-
-impl RandomInput {
- pub fn new(b: &mut Bencher, len: usize) -> Self {
- b.bytes += len as u64;
- let page_size: usize = page_size::get();
- let mut buf = vec![0u8; len + page_size];
- let mut rng = rand::thread_rng();
- rng.fill_bytes(&mut buf);
- let mut offsets: Vec<usize> = (0..page_size).collect();
- offsets.shuffle(&mut rng);
- Self {
- buf,
- len,
- offsets,
- offset_index: 0,
- }
- }
-
- pub fn get(&mut self) -> &[u8] {
- let offset = self.offsets[self.offset_index];
- self.offset_index += 1;
- if self.offset_index >= self.offsets.len() {
- self.offset_index = 0;
- }
- &self.buf[offset..][..self.len]
- }
-}
-
-fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
- let mut state = [1u32; 8];
- let mut r = RandomInput::new(b, 64);
- let input = array_ref!(r.get(), 0, 64);
- b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
-}
-
-#[bench]
-fn bench_single_compression_portable(b: &mut Bencher) {
- bench_single_compression_fn(b, Platform::portable());
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse2(b: &mut Bencher) {
- if let Some(platform) = Platform::sse2() {
- bench_single_compression_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse41(b: &mut Bencher) {
- if let Some(platform) = Platform::sse41() {
- bench_single_compression_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_single_compression_avx512(b: &mut Bencher) {
- if let Some(platform) = Platform::avx512() {
- bench_single_compression_fn(b, platform);
- }
-}
-
-fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
- let degree = platform.simd_degree();
- let mut inputs = Vec::new();
- for _ in 0..degree {
- inputs.push(RandomInput::new(b, CHUNK_LEN));
- }
- b.iter(|| {
- let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- platform.hash_many(
- &input_arrays[..],
- &[0; 8],
- 0,
- blake3::IncrementCounter::Yes,
- 0,
- 0,
- 0,
- &mut out,
- );
- });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse2(b: &mut Bencher) {
- if let Some(platform) = Platform::sse2() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse41(b: &mut Bencher) {
- if let Some(platform) = Platform::sse41() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_avx2(b: &mut Bencher) {
- if let Some(platform) = Platform::avx2() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_chunks_avx512(b: &mut Bencher) {
- if let Some(platform) = Platform::avx512() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_chunks_neon(b: &mut Bencher) {
- if let Some(platform) = Platform::neon() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-// TODO: When we get const generics we can unify this with the chunks code.
-fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
- let degree = platform.simd_degree();
- let mut inputs = Vec::new();
- for _ in 0..degree {
- inputs.push(RandomInput::new(b, BLOCK_LEN));
- }
- b.iter(|| {
- let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- platform.hash_many(
- &input_arrays[..],
- &[0; 8],
- 0,
- blake3::IncrementCounter::No,
- 0,
- 0,
- 0,
- &mut out,
- );
- });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse2(b: &mut Bencher) {
- if let Some(platform) = Platform::sse2() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse41(b: &mut Bencher) {
- if let Some(platform) = Platform::sse41() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_avx2(b: &mut Bencher) {
- if let Some(platform) = Platform::avx2() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_parents_avx512(b: &mut Bencher) {
- if let Some(platform) = Platform::avx512() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_parents_neon(b: &mut Bencher) {
- if let Some(platform) = Platform::neon() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-fn bench_atonce(b: &mut Bencher, len: usize) {
- let mut input = RandomInput::new(b, len);
- b.iter(|| blake3::hash(input.get()));
-}
-
-#[bench]
-fn bench_atonce_0001_block(b: &mut Bencher) {
- bench_atonce(b, BLOCK_LEN);
-}
-
-#[bench]
-fn bench_atonce_0001_kib(b: &mut Bencher) {
- bench_atonce(b, 1 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0002_kib(b: &mut Bencher) {
- bench_atonce(b, 2 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0004_kib(b: &mut Bencher) {
- bench_atonce(b, 4 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0008_kib(b: &mut Bencher) {
- bench_atonce(b, 8 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0016_kib(b: &mut Bencher) {
- bench_atonce(b, 16 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0032_kib(b: &mut Bencher) {
- bench_atonce(b, 32 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0064_kib(b: &mut Bencher) {
- bench_atonce(b, 64 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0128_kib(b: &mut Bencher) {
- bench_atonce(b, 128 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0256_kib(b: &mut Bencher) {
- bench_atonce(b, 256 * KIB);
-}
-
-#[bench]
-fn bench_atonce_0512_kib(b: &mut Bencher) {
- bench_atonce(b, 512 * KIB);
-}
-
-#[bench]
-fn bench_atonce_1024_kib(b: &mut Bencher) {
- bench_atonce(b, 1024 * KIB);
-}
-
-fn bench_incremental(b: &mut Bencher, len: usize) {
- let mut input = RandomInput::new(b, len);
- b.iter(|| blake3::Hasher::new().update(input.get()).finalize());
-}
-
-#[bench]
-fn bench_incremental_0001_block(b: &mut Bencher) {
- bench_incremental(b, BLOCK_LEN);
-}
-
-#[bench]
-fn bench_incremental_0001_kib(b: &mut Bencher) {
- bench_incremental(b, 1 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0002_kib(b: &mut Bencher) {
- bench_incremental(b, 2 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0004_kib(b: &mut Bencher) {
- bench_incremental(b, 4 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0008_kib(b: &mut Bencher) {
- bench_incremental(b, 8 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0016_kib(b: &mut Bencher) {
- bench_incremental(b, 16 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0032_kib(b: &mut Bencher) {
- bench_incremental(b, 32 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0064_kib(b: &mut Bencher) {
- bench_incremental(b, 64 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0128_kib(b: &mut Bencher) {
- bench_incremental(b, 128 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0256_kib(b: &mut Bencher) {
- bench_incremental(b, 256 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0512_kib(b: &mut Bencher) {
- bench_incremental(b, 512 * KIB);
-}
-
-#[bench]
-fn bench_incremental_1024_kib(b: &mut Bencher) {
- bench_incremental(b, 1024 * KIB);
-}
-
-fn bench_reference(b: &mut Bencher, len: usize) {
- let mut input = RandomInput::new(b, len);
- b.iter(|| {
- let mut hasher = reference_impl::Hasher::new();
- hasher.update(input.get());
- let mut out = [0; 32];
- hasher.finalize(&mut out);
- out
- });
-}
-
-#[bench]
-fn bench_reference_0001_block(b: &mut Bencher) {
- bench_reference(b, BLOCK_LEN);
-}
-
-#[bench]
-fn bench_reference_0001_kib(b: &mut Bencher) {
- bench_reference(b, 1 * KIB);
-}
-
-#[bench]
-fn bench_reference_0002_kib(b: &mut Bencher) {
- bench_reference(b, 2 * KIB);
-}
-
-#[bench]
-fn bench_reference_0004_kib(b: &mut Bencher) {
- bench_reference(b, 4 * KIB);
-}
-
-#[bench]
-fn bench_reference_0008_kib(b: &mut Bencher) {
- bench_reference(b, 8 * KIB);
-}
-
-#[bench]
-fn bench_reference_0016_kib(b: &mut Bencher) {
- bench_reference(b, 16 * KIB);
-}
-
-#[bench]
-fn bench_reference_0032_kib(b: &mut Bencher) {
- bench_reference(b, 32 * KIB);
-}
-
-#[bench]
-fn bench_reference_0064_kib(b: &mut Bencher) {
- bench_reference(b, 64 * KIB);
-}
-
-#[bench]
-fn bench_reference_0128_kib(b: &mut Bencher) {
- bench_reference(b, 128 * KIB);
-}
-
-#[bench]
-fn bench_reference_0256_kib(b: &mut Bencher) {
- bench_reference(b, 256 * KIB);
-}
-
-#[bench]
-fn bench_reference_0512_kib(b: &mut Bencher) {
- bench_reference(b, 512 * KIB);
-}
-
-#[bench]
-fn bench_reference_1024_kib(b: &mut Bencher) {
- bench_reference(b, 1024 * KIB);
-}
-
-#[cfg(feature = "rayon")]
-fn bench_rayon(b: &mut Bencher, len: usize) {
- let mut input = RandomInput::new(b, len);
- b.iter(|| {
- blake3::Hasher::new()
- .update_with_join::<blake3::join::RayonJoin>(input.get())
- .finalize()
- });
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0001_block(b: &mut Bencher) {
- bench_rayon(b, BLOCK_LEN);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0001_kib(b: &mut Bencher) {
- bench_rayon(b, 1 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0002_kib(b: &mut Bencher) {
- bench_rayon(b, 2 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0004_kib(b: &mut Bencher) {
- bench_rayon(b, 4 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0008_kib(b: &mut Bencher) {
- bench_rayon(b, 8 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0016_kib(b: &mut Bencher) {
- bench_rayon(b, 16 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0032_kib(b: &mut Bencher) {
- bench_rayon(b, 32 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0064_kib(b: &mut Bencher) {
- bench_rayon(b, 64 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0128_kib(b: &mut Bencher) {
- bench_rayon(b, 128 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0256_kib(b: &mut Bencher) {
- bench_rayon(b, 256 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_0512_kib(b: &mut Bencher) {
- bench_rayon(b, 512 * KIB);
-}
-
-#[bench]
-#[cfg(feature = "rayon")]
-fn bench_rayon_1024_kib(b: &mut Bencher) {
- bench_rayon(b, 1024 * KIB);
-}
-
-// This checks that update() splits up its input in increasing powers of 2, so
-// that it can recover a high degree of parallelism when the number of bytes
-// hashed so far is uneven. The performance of this benchmark should be
-// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
-// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
-// performance was less than half.
-#[bench]
-fn bench_two_updates(b: &mut Bencher) {
- let len = 65536;
- let mut input = RandomInput::new(b, len);
- b.iter(|| {
- let mut hasher = blake3::Hasher::new();
- let input = input.get();
- hasher.update(&input[..1]);
- hasher.update(&input[1..]);
- hasher.finalize()
- });
-}
diff --git a/thirdparty/BLAKE3/build.rs b/thirdparty/BLAKE3/build.rs
deleted file mode 100644
index ea657d8db..000000000
--- a/thirdparty/BLAKE3/build.rs
+++ /dev/null
@@ -1,260 +0,0 @@
-use std::env;
-
-fn defined(var: &str) -> bool {
- println!("cargo:rerun-if-env-changed={}", var);
- env::var_os(var).is_some()
-}
-
-fn is_pure() -> bool {
- defined("CARGO_FEATURE_PURE")
-}
-
-fn should_prefer_intrinsics() -> bool {
- defined("CARGO_FEATURE_PREFER_INTRINSICS")
-}
-
-fn is_neon() -> bool {
- defined("CARGO_FEATURE_NEON")
-}
-
-fn is_ci() -> bool {
- defined("BLAKE3_CI")
-}
-
-fn warn(warning: &str) {
- assert!(!warning.contains("\n"));
- println!("cargo:warning={}", warning);
- if is_ci() {
- println!("cargo:warning=Warnings in CI are treated as errors. Build failed.");
- std::process::exit(1);
- }
-}
-
-fn target_components() -> Vec<String> {
- let target = env::var("TARGET").unwrap();
- target.split("-").map(|s| s.to_string()).collect()
-}
-
-fn is_x86_64() -> bool {
- target_components()[0] == "x86_64"
-}
-
-fn is_x86_32() -> bool {
- let arch = &target_components()[0];
- arch == "i386" || arch == "i586" || arch == "i686"
-}
-
-fn is_armv7() -> bool {
- target_components()[0] == "armv7"
-}
-
-// Windows targets may be using the MSVC toolchain or the GNU toolchain. The
-// right compiler flags to use depend on the toolchain. (And we don't want to
-// use flag_if_supported, because we don't want features to be silently
-// disabled by old compilers.)
-fn is_windows_msvc() -> bool {
- // Some targets are only two components long, so check in steps.
- target_components()[1] == "pc"
- && target_components()[2] == "windows"
- && target_components()[3] == "msvc"
-}
-
-fn is_windows_gnu() -> bool {
- // Some targets are only two components long, so check in steps.
- target_components()[1] == "pc"
- && target_components()[2] == "windows"
- && target_components()[3] == "gnu"
-}
-
-fn new_build() -> cc::Build {
- let mut build = cc::Build::new();
- if !is_windows_msvc() {
- build.flag("-std=c11");
- }
- build
-}
-
-#[derive(PartialEq)]
-enum CCompilerSupport {
- NoCompiler,
- NoAVX512,
- YesAVX512,
-}
-use CCompilerSupport::*;
-
-fn c_compiler_support() -> CCompilerSupport {
- let build = new_build();
- let flags_checked;
- let support_result: Result<bool, _> = if is_windows_msvc() {
- flags_checked = "/arch:AVX512";
- build.is_flag_supported("/arch:AVX512")
- } else {
- // Check for both of the flags we use. If -mavx512f works, then -mavx512vl
- // will probably always work too, but we might as well be thorough.
- flags_checked = "-mavx512f and -mavx512vl";
- match build.is_flag_supported("-mavx512f") {
- Ok(true) => build.is_flag_supported("-mavx512vl"),
- false_or_error => false_or_error,
- }
- };
- match support_result {
- Ok(true) => YesAVX512,
- Ok(false) => {
- warn(&format!(
- "The C compiler {:?} does not support {}.",
- build.get_compiler().path(),
- flags_checked,
- ));
- NoAVX512
- }
- Err(e) => {
- println!("{:?}", e);
- warn(&format!(
- "No C compiler {:?} detected.",
- build.get_compiler().path()
- ));
- NoCompiler
- }
- }
-}
-
-fn build_sse2_sse41_avx2_rust_intrinsics() {
- // No C code to compile here. Set the cfg flags that enable the Rust SSE2,
- // SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile
- // them.
- println!("cargo:rustc-cfg=blake3_sse2_rust");
- println!("cargo:rustc-cfg=blake3_sse41_rust");
- println!("cargo:rustc-cfg=blake3_avx2_rust");
-}
-
-fn build_sse2_sse41_avx2_assembly() {
- // Build the assembly implementations for SSE2, SSE4.1, and AVX2. This is
- // preferred, but it only supports x86_64.
- assert!(is_x86_64());
- println!("cargo:rustc-cfg=blake3_sse2_ffi");
- println!("cargo:rustc-cfg=blake3_sse41_ffi");
- println!("cargo:rustc-cfg=blake3_avx2_ffi");
- let mut build = new_build();
- if is_windows_msvc() {
- build.file("c/blake3_sse2_x86-64_windows_msvc.asm");
- build.file("c/blake3_sse41_x86-64_windows_msvc.asm");
- build.file("c/blake3_avx2_x86-64_windows_msvc.asm");
- } else if is_windows_gnu() {
- build.file("c/blake3_sse2_x86-64_windows_gnu.S");
- build.file("c/blake3_sse41_x86-64_windows_gnu.S");
- build.file("c/blake3_avx2_x86-64_windows_gnu.S");
- } else {
- // All non-Windows implementations are assumed to support
- // Linux-style assembly. These files do contain a small
- // explicit workaround for macOS also.
- build.file("c/blake3_sse2_x86-64_unix.S");
- build.file("c/blake3_sse41_x86-64_unix.S");
- build.file("c/blake3_avx2_x86-64_unix.S");
- }
- build.compile("blake3_sse2_sse41_avx2_assembly");
-}
-
-fn build_avx512_c_intrinsics() {
- // This is required on 32-bit x86 targets, since the assembly
- // implementation doesn't support those.
- println!("cargo:rustc-cfg=blake3_avx512_ffi");
- let mut build = new_build();
- build.file("c/blake3_avx512.c");
- if is_windows_msvc() {
- build.flag("/arch:AVX512");
- } else {
- build.flag("-mavx512f");
- build.flag("-mavx512vl");
- }
- if is_windows_gnu() {
- // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782.
- build.flag("-fno-asynchronous-unwind-tables");
- }
- build.compile("blake3_avx512_intrinsics");
-}
-
-fn build_avx512_assembly() {
- // Build the assembly implementation for AVX-512. This is preferred, but it
- // only supports x86_64.
- assert!(is_x86_64());
- println!("cargo:rustc-cfg=blake3_avx512_ffi");
- let mut build = new_build();
- if is_windows_msvc() {
- build.file("c/blake3_avx512_x86-64_windows_msvc.asm");
- } else {
- if is_windows_gnu() {
- build.file("c/blake3_avx512_x86-64_windows_gnu.S");
- } else {
- // All non-Windows implementations are assumed to support Linux-style
- // assembly. These files do contain a small explicit workaround for
- // macOS also.
- build.file("c/blake3_avx512_x86-64_unix.S");
- }
- // Older versions of Clang require these flags, even for assembly. See
- // https://github.com/BLAKE3-team/BLAKE3/issues/79.
- build.flag("-mavx512f");
- build.flag("-mavx512vl");
- }
- build.compile("blake3_avx512_assembly");
-}
-
-fn build_neon_c_intrinsics() {
- let mut build = new_build();
- // Note that blake3_neon.c normally depends on the blake3_portable.c
- // for the single-instance compression function, but we expose
- // portable.rs over FFI instead. See ffi_neon.rs.
- build.file("c/blake3_neon.c");
- // ARMv7 platforms that support NEON generally need the following
- // flags. AArch64 supports NEON by default and does not support -mfpu.
- if is_armv7() {
- build.flag("-mfpu=neon-vfpv4");
- build.flag("-mfloat-abi=hard");
- }
- build.compile("blake3_neon");
-}
-
-fn main() -> Result<(), Box<dyn std::error::Error>> {
- if is_pure() && is_neon() {
- panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
- }
-
- if is_x86_64() || is_x86_32() {
- let support = c_compiler_support();
- if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
- build_sse2_sse41_avx2_rust_intrinsics();
- } else {
- // We assume that all C compilers can assemble SSE4.1 and AVX2. We
- // don't explicitly check for support.
- build_sse2_sse41_avx2_assembly();
- }
-
- if is_pure() || support == NoCompiler || support == NoAVX512 {
- // The binary will not include any AVX-512 code.
- } else if is_x86_32() || should_prefer_intrinsics() {
- build_avx512_c_intrinsics();
- } else {
- build_avx512_assembly();
- }
- }
-
- if is_neon() {
- build_neon_c_intrinsics();
- }
-
- // The `cc` crate doesn't automatically emit rerun-if directives for the
- // environment variables it supports, in particular for $CC. We expect to
- // do a lot of benchmarking across different compilers, so we explicitly
- // add the variables that we're likely to need.
- println!("cargo:rerun-if-env-changed=CC");
- println!("cargo:rerun-if-env-changed=CFLAGS");
-
- // Ditto for source files, though these shouldn't change as often.
- for file in std::fs::read_dir("c")? {
- println!(
- "cargo:rerun-if-changed={}",
- file?.path().to_str().expect("utf-8")
- );
- }
-
- Ok(())
-}
diff --git a/thirdparty/BLAKE3/c/.gitignore b/thirdparty/BLAKE3/c/.gitignore
deleted file mode 100644
index 0bf608cee..000000000
--- a/thirdparty/BLAKE3/c/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-blake3
-example
-*.o
diff --git a/thirdparty/BLAKE3/c/Makefile.testing b/thirdparty/BLAKE3/c/Makefile.testing
deleted file mode 100644
index 41e6b8285..000000000
--- a/thirdparty/BLAKE3/c/Makefile.testing
+++ /dev/null
@@ -1,78 +0,0 @@
-# This Makefile is only for testing. C callers should follow the instructions
-# in ./README.md to incorporate these C files into their existing build.
-
-NAME=blake3
-CC=gcc
-CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden
-LDFLAGS=-pie -Wl,-z,relro,-z,now
-TARGETS=
-ASM_TARGETS=
-EXTRAFLAGS=-Wa,--noexecstack
-
-ifdef BLAKE3_NO_SSE2
-EXTRAFLAGS += -DBLAKE3_NO_SSE2
-else
-TARGETS += blake3_sse2.o
-ASM_TARGETS += blake3_sse2_x86-64_unix.S
-endif
-
-ifdef BLAKE3_NO_SSE41
-EXTRAFLAGS += -DBLAKE3_NO_SSE41
-else
-TARGETS += blake3_sse41.o
-ASM_TARGETS += blake3_sse41_x86-64_unix.S
-endif
-
-ifdef BLAKE3_NO_AVX2
-EXTRAFLAGS += -DBLAKE3_NO_AVX2
-else
-TARGETS += blake3_avx2.o
-ASM_TARGETS += blake3_avx2_x86-64_unix.S
-endif
-
-ifdef BLAKE3_NO_AVX512
-EXTRAFLAGS += -DBLAKE3_NO_AVX512
-else
-TARGETS += blake3_avx512.o
-ASM_TARGETS += blake3_avx512_x86-64_unix.S
-endif
-
-ifdef BLAKE3_USE_NEON
-EXTRAFLAGS += -DBLAKE3_USE_NEON
-TARGETS += blake3_neon.o
-endif
-
-all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS)
- $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
-
-blake3_sse2.o: blake3_sse2.c
- $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2
-
-blake3_sse41.o: blake3_sse41.c
- $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1
-
-blake3_avx2.o: blake3_avx2.c
- $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2
-
-blake3_avx512.o: blake3_avx512.c
- $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl
-
-blake3_neon.o: blake3_neon.c
- $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@
-
-test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
-test: all
- ./test.py
-
-asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS)
- $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
-
-test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
-test_asm: asm
- ./test.py
-
-example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS)
- $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS)
-
-clean:
- rm -f $(NAME) *.o
diff --git a/thirdparty/BLAKE3/c/README.md b/thirdparty/BLAKE3/c/README.md
deleted file mode 100644
index 5e8b4e682..000000000
--- a/thirdparty/BLAKE3/c/README.md
+++ /dev/null
@@ -1,270 +0,0 @@
-The official C implementation of BLAKE3.
-
-# Example
-
-An example program that hashes bytes from standard input and prints the
-result:
-
-```c
-#include "blake3.h"
-#include <stdio.h>
-#include <unistd.h>
-
-int main() {
- // Initialize the hasher.
- blake3_hasher hasher;
- blake3_hasher_init(&hasher);
-
- // Read input bytes from stdin.
- unsigned char buf[65536];
- ssize_t n;
- while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) {
- blake3_hasher_update(&hasher, buf, n);
- }
-
- // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
- uint8_t output[BLAKE3_OUT_LEN];
- blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
-
- // Print the hash as hexadecimal.
- for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
- printf("%02x", output[i]);
- }
- printf("\n");
- return 0;
-}
-```
-
-The code above is included in this directory as `example.c`. If you're
-on x86\_64 with a Unix-like OS, you can compile a working binary like
-this:
-
-```bash
-gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
- blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
- blake3_avx512_x86-64_unix.S
-```
-
-# API
-
-## The Struct
-
-```c
-typedef struct {
- // private fields
-} blake3_hasher;
-```
-
-An incremental BLAKE3 hashing state, which can accept any number of
-updates. This implementation doesn't allocate any heap memory, but
-`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes
-on x86-64. This size can be reduced by restricting the maximum input
-length, as described in Section 5.4 of [the BLAKE3
-spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf),
-but this implementation doesn't currently support that strategy.
-
-## Common API Functions
-
-```c
-void blake3_hasher_init(
- blake3_hasher *self);
-```
-
-Initialize a `blake3_hasher` in the default hashing mode.
-
----
-
-```c
-void blake3_hasher_update(
- blake3_hasher *self,
- const void *input,
- size_t input_len);
-```
-
-Add input to the hasher. This can be called any number of times.
-
----
-
-```c
-void blake3_hasher_finalize(
- const blake3_hasher *self,
- uint8_t *out,
- size_t out_len);
-```
-
-Finalize the hasher and emit an output of any length. This doesn't
-modify the hasher itself, and it's possible to finalize again after
-adding more input. The constant `BLAKE3_OUT_LEN` provides the default
-output length, 32 bytes.
-
-## Less Common API Functions
-
-```c
-void blake3_hasher_init_keyed(
- blake3_hasher *self,
- const uint8_t key[BLAKE3_KEY_LEN]);
-```
-
-Initialize a `blake3_hasher` in the keyed hashing mode. The key must be
-exactly 32 bytes.
-
----
-
-```c
-void blake3_hasher_init_derive_key(
- blake3_hasher *self,
- const char *context);
-```
-
-Initialize a `blake3_hasher` in the key derivation mode. The context
-string is given as an initialization parameter, and afterwards input key
-material should be given with `blake3_hasher_update`. The context string
-is a null-terminated C string which should be **hardcoded, globally
-unique, and application-specific**. The context string should not
-include any dynamic input like salts, nonces, or identifiers read from a
-database at runtime. A good default format for the context string is
-`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com
-2019-12-25 16:18:03 session tokens v1"`.
-
-This function is intended for application code written in C. For
-language bindings, see `blake3_hasher_init_derive_key_raw` below.
-
----
-
-```c
-void blake3_hasher_init_derive_key_raw(
- blake3_hasher *self,
- const void *context,
- size_t context_len);
-```
-
-As `blake3_hasher_init_derive_key` above, except that the context string
-is given as a pointer to an array of arbitrary bytes with a provided
-length. This is intended for writing language bindings, where C string
-conversion would add unnecessary overhead and new error cases. Unicode
-strings should be encoded as UTF-8.
-
-Application code in C should prefer `blake3_hasher_init_derive_key`,
-which takes the context as a C string. If you need to use arbitrary
-bytes as a context string in application code, consider whether you're
-violating the requirement that context strings should be hardcoded.
-
----
-
-```c
-void blake3_hasher_finalize_seek(
- const blake3_hasher *self,
- uint64_t seek,
- uint8_t *out,
- size_t out_len);
-```
-
-The same as `blake3_hasher_finalize`, but with an additional `seek`
-parameter for the starting byte position in the output stream. To
-efficiently stream a large output without allocating memory, call this
-function in a loop, incrementing `seek` by the output length each time.
-
-# Building
-
-This implementation is just C and assembly files. It doesn't include a
-public-facing build system. (The `Makefile` in this directory is only
-for testing.) Instead, the intention is that you can include these files
-in whatever build system you're already using. This section describes
-the commands your build system should execute, or which you can execute
-by hand. Note that these steps may change in future versions.
-
-## x86
-
-Dynamic dispatch is enabled by default on x86. The implementation will
-query the CPU at runtime to detect SIMD support, and it will use the
-widest instruction set available. By default, `blake3_dispatch.c`
-expects to be linked with code for five different instruction sets:
-portable C, SSE2, SSE4.1, AVX2, and AVX-512.
-
-For each of the x86 SIMD instruction sets, two versions are available,
-one in assembly (which is further divided into three flavors: Unix,
-Windows MSVC, and Windows GNU) and one using C intrinsics. The assembly
-versions are generally preferred: they perform better, they perform more
-consistently across different compilers, and they build more quickly. On
-the other hand, the assembly versions are x86\_64-only, and you need to
-select the right flavor for your target platform.
-
-Here's an example of building a shared library on x86\_64 Linux using
-the assembly implementations:
-
-```bash
-gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
- blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
- blake3_avx512_x86-64_unix.S
-```
-
-When building the intrinsics-based implementations, you need to build
-each implementation separately, with the corresponding instruction set
-explicitly enabled in the compiler. Here's the same shared library using
-the intrinsics-based implementations:
-
-```bash
-gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
-gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
-gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
-gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
-gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
- blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
-```
-
-Note above that building `blake3_avx512.c` requires both `-mavx512f` and
-`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512`
-flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`.
-MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a
-corresponding flag.
-
-If you want to omit SIMD code entirely, you need to explicitly disable
-each instruction set. Here's an example of building a shared library on
-x86 with only portable code:
-
-```bash
-gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
- -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
-```
-
-## ARM NEON
-
-The NEON implementation is not enabled by default on ARM, since not all
-ARM targets support it. To enable it, set `BLAKE3_USE_NEON=1`. Here's an
-example of building a shared library on ARM Linux with NEON support:
-
-```bash
-gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON blake3.c blake3_dispatch.c \
- blake3_portable.c blake3_neon.c
-```
-
-Note that on some targets (ARMv7 in particular), extra flags may be
-required to activate NEON support in the compiler. If you see an error
-like...
-
-```
-/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed
-in call to always_inline ‘vaddq_u32’: target specific option mismatch
-```
-
-...then you may need to add something like `-mfpu=neon-vfpv4
--mfloat-abi=hard`.
-
-## Other Platforms
-
-The portable implementation should work on most other architectures. For
-example:
-
-```bash
-gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c
-```
-
-# Differences from the Rust Implementation
-
-The single-threaded Rust and C implementations use the same algorithms,
-and their performance is the same if you use the assembly
-implementations or if you compile the intrinsics-based implementations
-with Clang. (Both Clang and rustc are LLVM-based.)
-
-The C implementation doesn't currently include any multithreading
-optimizations. OpenMP support or similar might be added in the future.
diff --git a/thirdparty/BLAKE3/c/blake3.c b/thirdparty/BLAKE3/c/blake3.c
deleted file mode 100644
index 7abf5324e..000000000
--- a/thirdparty/BLAKE3/c/blake3.c
+++ /dev/null
@@ -1,607 +0,0 @@
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-
-#include "blake3.h"
-#include "blake3_impl.h"
-
-const char * blake3_version(void) {
- return BLAKE3_VERSION_STRING;
-}
-
-INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
- uint8_t flags) {
- memcpy(self->cv, key, BLAKE3_KEY_LEN);
- self->chunk_counter = 0;
- memset(self->buf, 0, BLAKE3_BLOCK_LEN);
- self->buf_len = 0;
- self->blocks_compressed = 0;
- self->flags = flags;
-}
-
-INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
- uint64_t chunk_counter) {
- memcpy(self->cv, key, BLAKE3_KEY_LEN);
- self->chunk_counter = chunk_counter;
- self->blocks_compressed = 0;
- memset(self->buf, 0, BLAKE3_BLOCK_LEN);
- self->buf_len = 0;
-}
-
-INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
- return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) +
- ((size_t)self->buf_len);
-}
-
-INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
- const uint8_t *input, size_t input_len) {
- size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
- if (take > input_len) {
- take = input_len;
- }
- uint8_t *dest = self->buf + ((size_t)self->buf_len);
- memcpy(dest, input, take);
- self->buf_len += (uint8_t)take;
- return take;
-}
-
-INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
- if (self->blocks_compressed == 0) {
- return CHUNK_START;
- } else {
- return 0;
- }
-}
-
-typedef struct {
- uint32_t input_cv[8];
- uint64_t counter;
- uint8_t block[BLAKE3_BLOCK_LEN];
- uint8_t block_len;
- uint8_t flags;
-} output_t;
-
-INLINE output_t make_output(const uint32_t input_cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags) {
- output_t ret;
- memcpy(ret.input_cv, input_cv, 32);
- memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
- ret.block_len = block_len;
- ret.counter = counter;
- ret.flags = flags;
- return ret;
-}
-
-// Chaining values within a given chunk (specifically the compress_in_place
-// interface) are represented as words. This avoids unnecessary bytes<->words
-// conversion overhead in the portable implementation. However, the hash_many
-// interface handles both user input and parent node blocks, so it accepts
-// bytes. For that reason, chaining values in the CV stack are represented as
-// bytes.
-INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
- uint32_t cv_words[8];
- memcpy(cv_words, self->input_cv, 32);
- blake3_compress_in_place(cv_words, self->block, self->block_len,
- self->counter, self->flags);
- store_cv_words(cv, cv_words);
-}
-
-INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
- size_t out_len) {
- uint64_t output_block_counter = seek / 64;
- size_t offset_within_block = seek % 64;
- uint8_t wide_buf[64];
- while (out_len > 0) {
- blake3_compress_xof(self->input_cv, self->block, self->block_len,
- output_block_counter, self->flags | ROOT, wide_buf);
- size_t available_bytes = 64 - offset_within_block;
- size_t memcpy_len;
- if (out_len > available_bytes) {
- memcpy_len = available_bytes;
- } else {
- memcpy_len = out_len;
- }
- memcpy(out, wide_buf + offset_within_block, memcpy_len);
- out += memcpy_len;
- out_len -= memcpy_len;
- output_block_counter += 1;
- offset_within_block = 0;
- }
-}
-
-INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
- size_t input_len) {
- if (self->buf_len > 0) {
- size_t take = chunk_state_fill_buf(self, input, input_len);
- input += take;
- input_len -= take;
- if (input_len > 0) {
- blake3_compress_in_place(
- self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
- self->flags | chunk_state_maybe_start_flag(self));
- self->blocks_compressed += 1;
- self->buf_len = 0;
- memset(self->buf, 0, BLAKE3_BLOCK_LEN);
- }
- }
-
- while (input_len > BLAKE3_BLOCK_LEN) {
- blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
- self->chunk_counter,
- self->flags | chunk_state_maybe_start_flag(self));
- self->blocks_compressed += 1;
- input += BLAKE3_BLOCK_LEN;
- input_len -= BLAKE3_BLOCK_LEN;
- }
-
- size_t take = chunk_state_fill_buf(self, input, input_len);
- input += take;
- input_len -= take;
-}
-
-INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
- uint8_t block_flags =
- self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
- return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
- block_flags);
-}
-
-INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
- const uint32_t key[8], uint8_t flags) {
- return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
-}
-
-// Given some input larger than one chunk, return the number of bytes that
-// should go in the left subtree. This is the largest power-of-2 number of
-// chunks that leaves at least 1 byte for the right subtree.
-INLINE size_t left_len(size_t content_len) {
- // Subtract 1 to reserve at least one byte for the right side. content_len
- // should always be greater than BLAKE3_CHUNK_LEN.
- size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
- return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
-// on a single thread. Write out the chunk chaining values and return the
-// number of chunks hashed. These chunks are never the root and never empty;
-// those cases use a different codepath.
-INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
- const uint32_t key[8],
- uint64_t chunk_counter, uint8_t flags,
- uint8_t *out) {
-#if defined(BLAKE3_TESTING)
- assert(0 < input_len);
- assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
-#endif
-
- const uint8_t *chunks_array[MAX_SIMD_DEGREE];
- size_t input_position = 0;
- size_t chunks_array_len = 0;
- while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
- chunks_array[chunks_array_len] = &input[input_position];
- input_position += BLAKE3_CHUNK_LEN;
- chunks_array_len += 1;
- }
-
- blake3_hash_many(chunks_array, chunks_array_len,
- BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
- true, flags, CHUNK_START, CHUNK_END, out);
-
- // Hash the remaining partial chunk, if there is one. Note that the empty
- // chunk (meaning the empty message) is a different codepath.
- if (input_len > input_position) {
- uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
- blake3_chunk_state chunk_state;
- chunk_state_init(&chunk_state, key, flags);
- chunk_state.chunk_counter = counter;
- chunk_state_update(&chunk_state, &input[input_position],
- input_len - input_position);
- output_t output = chunk_state_output(&chunk_state);
- output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
- return chunks_array_len + 1;
- } else {
- return chunks_array_len;
- }
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
-// on a single thread. Write out the parent chaining values and return the
-// number of parents hashed. (If there's an odd input chaining value left over,
-// return it as an additional output.) These parents are never the root and
-// never empty; those cases use a different codepath.
-INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
- size_t num_chaining_values,
- const uint32_t key[8], uint8_t flags,
- uint8_t *out) {
-#if defined(BLAKE3_TESTING)
- assert(2 <= num_chaining_values);
- assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
-#endif
-
- const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
- size_t parents_array_len = 0;
- while (num_chaining_values - (2 * parents_array_len) >= 2) {
- parents_array[parents_array_len] =
- &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
- parents_array_len += 1;
- }
-
- blake3_hash_many(parents_array, parents_array_len, 1, key,
- 0, // Parents always use counter 0.
- false, flags | PARENT,
- 0, // Parents have no start flags.
- 0, // Parents have no end flags.
- out);
-
- // If there's an odd child left over, it becomes an output.
- if (num_chaining_values > 2 * parents_array_len) {
- memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
- &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
- BLAKE3_OUT_LEN);
- return parents_array_len + 1;
- } else {
- return parents_array_len;
- }
-}
-
-// The wide helper function returns (writes out) an array of chaining values
-// and returns the length of that array. The number of chaining values returned
-// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
-// if the input is shorter than that many chunks. The reason for maintaining a
-// wide array of chaining values going back up the tree, is to allow the
-// implementation to hash as many parents in parallel as possible.
-//
-// As a special case when the SIMD degree is 1, this function will still return
-// at least 2 outputs. This guarantees that this function doesn't perform the
-// root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement extendable output.) Note that this function is
-// not used when the whole input is only 1 chunk long; that's a different
-// codepath.
-//
-// Why not just have the caller split the input on the first update(), instead
-// of implementing this special rule? Because we don't want to limit SIMD or
-// multi-threading parallelism for that update().
-static size_t blake3_compress_subtree_wide(const uint8_t *input,
- size_t input_len,
- const uint32_t key[8],
- uint64_t chunk_counter,
- uint8_t flags, uint8_t *out) {
- // Note that the single chunk case does *not* bump the SIMD degree up to 2
- // when it is 1. If this implementation adds multi-threading in the future,
- // this gives us the option of multi-threading even the 2-chunk case, which
- // can help performance on smaller platforms.
- if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
- return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
- out);
- }
-
- // With more than simd_degree chunks, we need to recurse. Start by dividing
- // the input into left and right subtrees. (Note that this is only optimal
- // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
- // of 3 or something, we'll need a more complicated strategy.)
- size_t left_input_len = left_len(input_len);
- size_t right_input_len = input_len - left_input_len;
- const uint8_t *right_input = &input[left_input_len];
- uint64_t right_chunk_counter =
- chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
-
- // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
- // account for the special case of returning 2 outputs when the SIMD degree
- // is 1.
- uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
- size_t degree = blake3_simd_degree();
- if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
- // The special case: We always use a degree of at least two, to make
- // sure there are two outputs. Except, as noted above, at the chunk
- // level, where we allow degree=1. (Note that the 1-chunk-input case is
- // a different codepath.)
- degree = 2;
- }
- uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
-
- // Recurse! If this implementation adds multi-threading support in the
- // future, this is where it will go.
- size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
- chunk_counter, flags, cv_array);
- size_t right_n = blake3_compress_subtree_wide(
- right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
-
- // The special case again. If simd_degree=1, then we'll have left_n=1 and
- // right_n=1. Rather than compressing them into a single output, return
- // them directly, to make sure we always have at least two outputs.
- if (left_n == 1) {
- memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
- return 2;
- }
-
- // Otherwise, do one layer of parent node compression.
- size_t num_chaining_values = left_n + right_n;
- return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
- out);
-}
-
-// Hash a subtree with compress_subtree_wide(), and then condense the resulting
-// list of chaining values down to a single parent node. Don't compress that
-// last parent node, however. Instead, return its message bytes (the
-// concatenated chaining values of its children). This is necessary when the
-// first call to update() supplies a complete subtree, because the topmost
-// parent node of that subtree could end up being the root. It's also necessary
-// for extended output in the general case.
-//
-// As with compress_subtree_wide(), this function is not used on inputs of 1
-// chunk or less. That's a different codepath.
-INLINE void compress_subtree_to_parent_node(
- const uint8_t *input, size_t input_len, const uint32_t key[8],
- uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
-#if defined(BLAKE3_TESTING)
- assert(input_len > BLAKE3_CHUNK_LEN);
-#endif
-
- uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
- size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
- chunk_counter, flags, cv_array);
-
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
- // compress_subtree_wide() returns more than 2 chaining values. Condense
- // them into 2 by forming parent nodes repeatedly.
- uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
- while (num_cvs > 2) {
- num_cvs =
- compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
- memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
- }
- memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
-}
-
-INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
- uint8_t flags) {
- memcpy(self->key, key, BLAKE3_KEY_LEN);
- chunk_state_init(&self->chunk, key, flags);
- self->cv_stack_len = 0;
-}
-
-void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }
-
-void blake3_hasher_init_keyed(blake3_hasher *self,
- const uint8_t key[BLAKE3_KEY_LEN]) {
- uint32_t key_words[8];
- load_key_words(key, key_words);
- hasher_init_base(self, key_words, KEYED_HASH);
-}
-
-void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
- size_t context_len) {
- blake3_hasher context_hasher;
- hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
- blake3_hasher_update(&context_hasher, context, context_len);
- uint8_t context_key[BLAKE3_KEY_LEN];
- blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
- uint32_t context_key_words[8];
- load_key_words(context_key, context_key_words);
- hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
-}
-
-void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
- blake3_hasher_init_derive_key_raw(self, context, strlen(context));
-}
-
-// As described in hasher_push_cv() below, we do "lazy merging", delaying
-// merges until right before the next CV is about to be added. This is
-// different from the reference implementation. Another difference is that we
-// aren't always merging 1 chunk at a time. Instead, each CV might represent
-// any power-of-two number of chunks, as long as the smaller-above-larger stack
-// order is maintained. Instead of the "count the trailing 0-bits" algorithm
-// described in the spec, we use a "count the total number of 1-bits" variant
-// that doesn't require us to retain the subtree size of the CV on top of the
-// stack. The principle is the same: each CV that should remain in the stack is
-// represented by a 1-bit in the total number of chunks (or bytes) so far.
-INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
- size_t post_merge_stack_len = (size_t)popcnt(total_len);
- while (self->cv_stack_len > post_merge_stack_len) {
- uint8_t *parent_node =
- &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
- output_t output = parent_output(parent_node, self->key, self->chunk.flags);
- output_chaining_value(&output, parent_node);
- self->cv_stack_len -= 1;
- }
-}
-
-// In reference_impl.rs, we merge the new CV with existing CVs from the stack
-// before pushing it. We can do that because we know more input is coming, so
-// we know none of the merges are root.
-//
-// This setting is different. We want to feed as much input as possible to
-// compress_subtree_wide(), without setting aside anything for the chunk_state.
-// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
-// as a single subtree, if at all possible.
-//
-// This leads to two problems:
-// 1) This 64 KiB input might be the only call that ever gets made to update.
-// In this case, the root node of the 64 KiB subtree would be the root node
-// of the whole tree, and it would need to be ROOT finalized. We can't
-// compress it until we know.
-// 2) This 64 KiB input might complete a larger tree, whose root node is
-// similarly going to be the root of the whole tree. For example, maybe
-// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
-// node at the root of the 256 KiB subtree until we know how to finalize it.
-//
-// The second problem is solved with "lazy merging". That is, when we're about
-// to add a CV to the stack, we don't merge it with anything first, as the
-// reference impl does. Instead we do merges using the *previous* CV that was
-// added, which is sitting on top of the stack, and we put the new CV
-// (unmerged) on top of the stack afterwards. This guarantees that we never
-// merge the root node until finalize().
-//
-// Solving the first problem requires an additional tool,
-// compress_subtree_to_parent_node(). That function always returns the top
-// *two* chaining values of the subtree it's compressing. We then do lazy
-// merging with each of them separately, so that the second CV will always
-// remain unmerged. (That also helps us support extendable output when we're
-// hashing an input all-at-once.)
-// Push `new_cv` onto the CV stack. Pending merges among the CVs already on
-// the stack are performed first, driven by `chunk_counter`; the new CV itself
-// is stored unmerged on top.
-INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
-                           uint64_t chunk_counter) {
-  hasher_merge_cv_stack(self, chunk_counter);
-  uint8_t *free_slot = &self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN];
-  memcpy(free_slot, new_cv, BLAKE3_OUT_LEN);
-  self->cv_stack_len += 1;
-}
-
-// Absorb `input_len` bytes of `input` into the hasher. A zero-length call
-// returns immediately without touching `input`. Otherwise the work happens in
-// three phases:
-// 1) top up a partially filled chunk left in self->chunk by an earlier call;
-// 2) while more than one chunk of input remains, hash power-of-2 sized
-// subtrees directly from the caller's buffer and push their CVs;
-// 3) buffer whatever is left (at most one chunk) in self->chunk.
-void blake3_hasher_update(blake3_hasher *self, const void *input,
- size_t input_len) {
- // Explicitly checking for zero avoids causing UB by passing a null pointer
- // to memcpy. This comes up in practice with things like:
- // std::vector<uint8_t> v;
- // blake3_hasher_update(&hasher, v.data(), v.size());
- if (input_len == 0) {
- return;
- }
-
- const uint8_t *input_bytes = (const uint8_t *)input;
-
- // If we have some partial chunk bytes in the internal chunk_state, we need
- // to finish that chunk first.
- if (chunk_state_len(&self->chunk) > 0) {
- size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
- if (take > input_len) {
- take = input_len;
- }
- chunk_state_update(&self->chunk, input_bytes, take);
- input_bytes += take;
- input_len -= take;
- // If we've filled the current chunk and there's more coming, finalize this
- // chunk and proceed. In this case we know it's not the root.
- if (input_len > 0) {
- output_t output = chunk_state_output(&self->chunk);
- uint8_t chunk_cv[32];
- output_chaining_value(&output, chunk_cv);
- hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
- chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
- } else {
- return;
- }
- }
-
- // Now the chunk_state is clear, and we have more input. If there's more than
- // a single chunk (so, definitely not the root chunk), hash the largest whole
- // subtree we can, with the full benefits of SIMD (and maybe in the future,
- // multi-threading) parallelism. Two restrictions:
- // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
- // the right edge can be incomplete, and we don't know where the right edge
- // is going to be until we get to finalize().
- // - The subtree must evenly divide the total number of chunks up until this
- // point (if total is not 0). If the current incomplete subtree is only
- // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
- // to complete the current subtree first.
- // Because we might need to break up the input to form powers of 2, or to
- // evenly divide what we already have, this part runs in a loop.
- while (input_len > BLAKE3_CHUNK_LEN) {
- size_t subtree_len = round_down_to_power_of_2(input_len);
- uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
- // Shrink the subtree_len until it evenly divides the count so far. We know
- // that subtree_len itself is a power of 2, so we can use a bitmasking
- // trick instead of an actual remainder operation. (Note that if the caller
- // consistently passes power-of-2 inputs of the same size, as is hopefully
- // typical, this loop condition will always fail, and subtree_len will
- // always be the full length of the input.)
- //
- // An aside: We don't have to shrink subtree_len quite this much. For
- // example, if count_so_far is 1, we could pass 2 chunks to
- // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
- // get the right answer in the end, and we might get to use 2-way SIMD
- // parallelism. The problem with this optimization, is that it gets us
- // stuck always hashing 2 chunks. The total number of chunks will remain
- // odd, and we'll never graduate to higher degrees of parallelism. See
- // https://github.com/BLAKE3-team/BLAKE3/issues/69.
- while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
- subtree_len /= 2;
- }
- // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
- // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
- uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
- if (subtree_len <= BLAKE3_CHUNK_LEN) {
- // A temporary chunk state is used so that self->chunk stays empty.
- blake3_chunk_state chunk_state;
- chunk_state_init(&chunk_state, self->key, self->chunk.flags);
- chunk_state.chunk_counter = self->chunk.chunk_counter;
- chunk_state_update(&chunk_state, input_bytes, subtree_len);
- output_t output = chunk_state_output(&chunk_state);
- uint8_t cv[BLAKE3_OUT_LEN];
- output_chaining_value(&output, cv);
- hasher_push_cv(self, cv, chunk_state.chunk_counter);
- } else {
- // This is the high-performance happy path, though getting here depends
- // on the caller giving us a long enough input.
- uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
- compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
- self->chunk.chunk_counter,
- self->chunk.flags, cv_pair);
- // The two CVs are pushed separately; the second one is tagged with the
- // chunk index halfway through the subtree.
- hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
- hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
- self->chunk.chunk_counter + (subtree_chunks / 2));
- }
- self->chunk.chunk_counter += subtree_chunks;
- input_bytes += subtree_len;
- input_len -= subtree_len;
- }
-
- // If there's any remaining input (at most one full chunk, given the loop
- // condition above), add it to the chunk state. In that case, also do a final
- // merge loop to make sure the subtree stack doesn't contain any unmerged
- // pairs. The remaining input means we know these merges are non-root. This
- // merge loop isn't strictly necessary here, because hasher_push_cv already
- // does its own merge loop, but it simplifies blake3_hasher_finalize below.
- if (input_len > 0) {
- chunk_state_update(&self->chunk, input_bytes, input_len);
- hasher_merge_cv_stack(self, self->chunk.chunk_counter);
- }
-}
-
-// Write `out_len` bytes of hash output to `out`, starting at the beginning of
-// the output stream (seek position 0). The hasher itself is not modified.
-void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
-                            size_t out_len) {
-  const uint64_t output_start = 0;
-  blake3_hasher_finalize_seek(self, output_start, out, out_len);
-}
-
-// Write `out_len` bytes of root output to `out`, starting `seek` bytes into
-// the output stream. The hasher is taken by const pointer and is not
-// modified, so the roll-up below works on local copies only. A zero `out_len`
-// returns immediately without touching `out`.
-void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
-                                 uint8_t *out, size_t out_len) {
-  // Explicitly checking for zero avoids causing UB by passing a null pointer
-  // to memcpy. This comes up in practice with things like:
-  //   std::vector<uint8_t> v;
-  //   blake3_hasher_finalize(&hasher, v.data(), v.size());
-  if (out_len == 0) {
-    return;
-  }
-
-  // If the subtree stack is empty, then the current chunk is the root.
-  if (self->cv_stack_len == 0) {
-    output_t output = chunk_state_output(&self->chunk);
-    output_root_bytes(&output, seek, out, out_len);
-    return;
-  }
-  // If there are any bytes in the chunk state, finalize that chunk and do a
-  // roll-up merge between that chunk hash and every subtree in the stack. In
-  // this case, the extra merge loop at the end of blake3_hasher_update
-  // guarantees that none of the subtrees in the stack need to be merged with
-  // each other first. Otherwise, if there are no bytes in the chunk state,
-  // then the top of the stack is a chunk hash, and we start the merge from
-  // that.
-  output_t output;
-  size_t cvs_remaining;
-  if (chunk_state_len(&self->chunk) > 0) {
-    cvs_remaining = self->cv_stack_len;
-    output = chunk_state_output(&self->chunk);
-  } else {
-    // There are always at least 2 CVs in the stack in this case.
-    cvs_remaining = self->cv_stack_len - 2;
-    output = parent_output(&self->cv_stack[cvs_remaining * BLAKE3_OUT_LEN],
-                           self->key, self->chunk.flags);
-  }
-  while (cvs_remaining > 0) {
-    cvs_remaining -= 1;
-    // Parent block = stack CV (left half) followed by the CV of the running
-    // output (right half).
-    uint8_t parent_block[BLAKE3_BLOCK_LEN];
-    memcpy(parent_block, &self->cv_stack[cvs_remaining * BLAKE3_OUT_LEN],
-           BLAKE3_OUT_LEN);
-    output_chaining_value(&output, &parent_block[BLAKE3_OUT_LEN]);
-    output = parent_output(parent_block, self->key, self->chunk.flags);
-  }
-  output_root_bytes(&output, seek, out, out_len);
-}
diff --git a/thirdparty/BLAKE3/c/blake3.h b/thirdparty/BLAKE3/c/blake3.h
deleted file mode 100644
index 57ebd5adc..000000000
--- a/thirdparty/BLAKE3/c/blake3.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef BLAKE3_H
-#define BLAKE3_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define BLAKE3_VERSION_STRING "0.3.7"
-#define BLAKE3_KEY_LEN 32
-#define BLAKE3_OUT_LEN 32
-#define BLAKE3_BLOCK_LEN 64
-#define BLAKE3_CHUNK_LEN 1024
-#define BLAKE3_MAX_DEPTH 54
-#define BLAKE3_MAX_SIMD_DEGREE 16
-
-// This struct is a private implementation detail. It has to be here because
-// it's part of blake3_hasher below.
-typedef struct {
- // NOTE(review): presumably the chaining value of the chunk in progress;
- // confirm against the chunk_state_* helpers in blake3.c.
- uint32_t cv[8];
- // Index of the chunk this state belongs to. blake3_hasher_update multiplies
- // it by BLAKE3_CHUNK_LEN to get the number of bytes hashed before this
- // chunk.
- uint64_t chunk_counter;
- // Staging buffer sized for one BLAKE3_BLOCK_LEN-byte block.
- uint8_t buf[BLAKE3_BLOCK_LEN];
- // NOTE(review): presumably the number of valid bytes in buf; confirm.
- uint8_t buf_len;
- // NOTE(review): presumably the number of blocks of this chunk already
- // compressed; confirm.
- uint8_t blocks_compressed;
- // Flag bits for this hasher; blake3_hasher_update and finalize pass them on
- // to parent_output() and chunk_state_init().
- uint8_t flags;
-} blake3_chunk_state;
-
-// Incremental hasher state. Callers allocate it themselves and pass it to the
-// blake3_hasher_* functions declared below.
-typedef struct {
- // Key words handed to every parent_output() / chunk_state_reset() call.
- uint32_t key[8];
- // State of the chunk currently being filled.
- blake3_chunk_state chunk;
- // Number of BLAKE3_OUT_LEN-byte chaining values currently stored in
- // cv_stack.
- uint8_t cv_stack_len;
- // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
- // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
- // requires a 4th entry, rather than merging everything down to 1, because we
- // don't know whether more input is coming. This is different from how the
- // reference implementation does things.
- uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
-} blake3_hasher;
-
-// Returns a version string.
-// NOTE(review): presumably BLAKE3_VERSION_STRING; confirm in blake3.c.
-const char * blake3_version(void);
-// NOTE(review): presumably sets up the default (unkeyed) hashing mode; the
-// definition is not in this header, so confirm in blake3.c.
-void blake3_hasher_init(blake3_hasher *self);
-// Initializes `self` from a BLAKE3_KEY_LEN-byte key.
-// NOTE(review): presumably the keyed-hash mode; confirm in blake3.c.
-void blake3_hasher_init_keyed(blake3_hasher *self,
- const uint8_t key[BLAKE3_KEY_LEN]);
-// Key-derivation mode with a NUL-terminated context string; the length is
-// taken with strlen(), so `context` must not be NULL.
-void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
-// Key-derivation mode with an explicit-length context buffer.
-void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
- size_t context_len);
-// Absorbs `input_len` bytes. A zero-length call returns without touching
-// `input`.
-void blake3_hasher_update(blake3_hasher *self, const void *input,
- size_t input_len);
-// Same as blake3_hasher_finalize_seek() with seek == 0.
-void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
- size_t out_len);
-// Writes `out_len` output bytes starting `seek` bytes into the output stream.
-// The hasher is const here. A zero `out_len` returns without touching `out`.
-void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
- uint8_t *out, size_t out_len);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* BLAKE3_H */
diff --git a/thirdparty/BLAKE3/c/blake3_avx2.c b/thirdparty/BLAKE3/c/blake3_avx2.c
deleted file mode 100644
index c5a2ce9e2..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx2.c
+++ /dev/null
@@ -1,325 +0,0 @@
-#include "blake3_impl.h"
-
-#include <immintrin.h>
-
-#define DEGREE 8
-
-// Unaligned load of 32 bytes from `src` into a 256-bit vector.
-INLINE __m256i loadu(const uint8_t src[32]) {
-  const __m256i *unaligned_src = (const __m256i *)src;
-  return _mm256_loadu_si256(unaligned_src);
-}
-
-// Unaligned store of the 256-bit vector `src` to `dest`. The store writes
-// sizeof(__m256i) == 32 bytes, so the array bound on `dest` is 32. The
-// parameter is still a plain uint8_t pointer, so callers are unaffected.
-INLINE void storeu(__m256i src, uint8_t dest[32]) {
-  _mm256_storeu_si256((__m256i *)dest, src);
-}
-
-// Lane-wise 32-bit addition of two 256-bit vectors.
-INLINE __m256i addv(__m256i a, __m256i b) {
-  const __m256i sum = _mm256_add_epi32(a, b);
-  return sum;
-}
-
-// Bitwise XOR of two 256-bit vectors. (It is called xorv rather than "xor"
-// because clang-format doesn't like the name "xor" for some reason.)
-INLINE __m256i xorv(__m256i a, __m256i b) {
-  const __m256i mixed = _mm256_xor_si256(a, b);
-  return mixed;
-}
-
-// Broadcast the 32-bit value `x` into all eight lanes.
-INLINE __m256i set1(uint32_t x) {
-  const int32_t lane_value = (int32_t)x;
-  return _mm256_set1_epi32(lane_value);
-}
-
-// Rotate every 32-bit lane by 16 bits. This is done with a byte shuffle that
-// swaps the two 16-bit halves of each lane.
-INLINE __m256i rot16(__m256i x) {
-  const __m256i swap_halves =
-      _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
-                      13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
-  return _mm256_shuffle_epi8(x, swap_halves);
-}
-
-// Rotate every 32-bit lane right by 12 bits (shift right 12, shift left 20,
-// then OR the two parts).
-INLINE __m256i rot12(__m256i x) {
-  const __m256i shifted_down = _mm256_srli_epi32(x, 12);
-  const __m256i wrapped_up = _mm256_slli_epi32(x, 32 - 12);
-  return _mm256_or_si256(shifted_down, wrapped_up);
-}
-
-// Rotate every 32-bit lane right by 8 bits. This is done with a byte shuffle
-// that moves each byte of a lane one position down, with the lowest byte
-// wrapping to the top.
-INLINE __m256i rot8(__m256i x) {
-  const __m256i rotate_bytes =
-      _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
-                      12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
-  return _mm256_shuffle_epi8(x, rotate_bytes);
-}
-
-// Rotate every 32-bit lane right by 7 bits (shift right 7, shift left 25,
-// then OR the two parts).
-INLINE __m256i rot7(__m256i x) {
-  const __m256i shifted_down = _mm256_srli_epi32(x, 7);
-  const __m256i wrapped_up = _mm256_slli_epi32(x, 32 - 7);
-  return _mm256_or_si256(shifted_down, wrapped_up);
-}
-
-// One round of the compression function over the 16-vector state `v`, where
-// every vector carries eight 32-bit lanes that are processed in lockstep.
-// `m` holds the 16 message-word vectors and `r` selects the row of
-// MSG_SCHEDULE that decides which message word feeds each step.
-//
-// The first half mixes the four columns (0,4,8,12) (1,5,9,13) (2,6,10,14)
-// (3,7,11,15). The second half mixes the four diagonals (0,5,10,15)
-// (1,6,11,12) (2,7,8,13) (3,4,9,14). Within each half the four mixes are
-// written interleaved, one operation at a time across all four, instead of
-// one mix after another.
-INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
- // Column step, first part: add message words 0/2/4/6 of this round's
- // schedule, then rotate by 16 and 12.
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
- v[0] = addv(v[0], v[4]);
- v[1] = addv(v[1], v[5]);
- v[2] = addv(v[2], v[6]);
- v[3] = addv(v[3], v[7]);
- v[12] = xorv(v[12], v[0]);
- v[13] = xorv(v[13], v[1]);
- v[14] = xorv(v[14], v[2]);
- v[15] = xorv(v[15], v[3]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[15] = rot16(v[15]);
- v[8] = addv(v[8], v[12]);
- v[9] = addv(v[9], v[13]);
- v[10] = addv(v[10], v[14]);
- v[11] = addv(v[11], v[15]);
- v[4] = xorv(v[4], v[8]);
- v[5] = xorv(v[5], v[9]);
- v[6] = xorv(v[6], v[10]);
- v[7] = xorv(v[7], v[11]);
- v[4] = rot12(v[4]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- // Column step, second part: add message words 1/3/5/7, then rotate by 8
- // and 7.
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
- v[0] = addv(v[0], v[4]);
- v[1] = addv(v[1], v[5]);
- v[2] = addv(v[2], v[6]);
- v[3] = addv(v[3], v[7]);
- v[12] = xorv(v[12], v[0]);
- v[13] = xorv(v[13], v[1]);
- v[14] = xorv(v[14], v[2]);
- v[15] = xorv(v[15], v[3]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[15] = rot8(v[15]);
- v[8] = addv(v[8], v[12]);
- v[9] = addv(v[9], v[13]);
- v[10] = addv(v[10], v[14]);
- v[11] = addv(v[11], v[15]);
- v[4] = xorv(v[4], v[8]);
- v[5] = xorv(v[5], v[9]);
- v[6] = xorv(v[6], v[10]);
- v[7] = xorv(v[7], v[11]);
- v[4] = rot7(v[4]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
-
- // Diagonal step, first part: add message words 8/10/12/14, then rotate by
- // 16 and 12. The state indices are shifted relative to the column step.
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
- v[0] = addv(v[0], v[5]);
- v[1] = addv(v[1], v[6]);
- v[2] = addv(v[2], v[7]);
- v[3] = addv(v[3], v[4]);
- v[15] = xorv(v[15], v[0]);
- v[12] = xorv(v[12], v[1]);
- v[13] = xorv(v[13], v[2]);
- v[14] = xorv(v[14], v[3]);
- v[15] = rot16(v[15]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[10] = addv(v[10], v[15]);
- v[11] = addv(v[11], v[12]);
- v[8] = addv(v[8], v[13]);
- v[9] = addv(v[9], v[14]);
- v[5] = xorv(v[5], v[10]);
- v[6] = xorv(v[6], v[11]);
- v[7] = xorv(v[7], v[8]);
- v[4] = xorv(v[4], v[9]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[4] = rot12(v[4]);
- // Diagonal step, second part: add message words 9/11/13/15, then rotate by
- // 8 and 7.
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
- v[0] = addv(v[0], v[5]);
- v[1] = addv(v[1], v[6]);
- v[2] = addv(v[2], v[7]);
- v[3] = addv(v[3], v[4]);
- v[15] = xorv(v[15], v[0]);
- v[12] = xorv(v[12], v[1]);
- v[13] = xorv(v[13], v[2]);
- v[14] = xorv(v[14], v[3]);
- v[15] = rot8(v[15]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[10] = addv(v[10], v[15]);
- v[11] = addv(v[11], v[12]);
- v[8] = addv(v[8], v[13]);
- v[9] = addv(v[9], v[14]);
- v[5] = xorv(v[5], v[10]);
- v[6] = xorv(v[6], v[11]);
- v[7] = xorv(v[7], v[8]);
- v[4] = xorv(v[4], v[9]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
- v[4] = rot7(v[4]);
-}
-
-// Transpose, in place, the 8x8 matrix of 32-bit words held in the DEGREE
-// vectors of `vecs` (one row per vector). It takes three interleave passes of
-// growing width: 32-bit, 64-bit, then 128-bit. In the temporaries' names the
-// letters a..h are the source rows and the digits are the source word
-// positions they carry.
-INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
- // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
- // is 22/33/66/77.
- __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
- __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
- __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
- __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
- __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
- __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
- __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
- __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
-
- // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
- // 11/33.
- __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
- __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
- __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
- __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
- __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
- __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
- __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
- __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
-
- // Interleave 128-bit lanes. Selector 0x20 joins the low 128-bit halves of
- // the two operands, 0x31 joins their high halves.
- vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
- vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
- vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
- vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
- vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
- vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
- vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
- vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
-}
-
-// Load one 64-byte block, starting at `block_offset`, from each of the DEGREE
-// input pointers and transpose it, so that afterwards out[w] holds message
-// word w of every input (one input per 32-bit lane). out[0..7] come from the
-// first 32 bytes of each block and out[8..15] from the second 32 bytes. Data
-// 256 bytes past the current offset is prefetched for each input.
-INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
-                               size_t block_offset, __m256i out[16]) {
-  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
-  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
-  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
-  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
-  out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
-  out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
-  out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
-  out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
-  out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
-  out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
-  out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
-  out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
-  out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
-  out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
-  out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
-  out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
-  for (size_t i = 0; i < DEGREE; ++i) {
-    // Compilers disagree on _mm_prefetch's pointer parameter type (some
-    // declare `const char *`, others `const void *`). In C a `const void *`
-    // converts implicitly to either, so casting avoids a pointer-type
-    // mismatch diagnostic for the uint8_t pointer.
-    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
-  }
-  transpose_vecs(&out[0]);
-  transpose_vecs(&out[8]);
-}
-
-// Build the eight per-lane 64-bit counters, split into a vector of low words
-// (*out_lo) and a vector of high words (*out_hi). Lane i receives counter + i
-// when `increment_counter` is true and plain `counter` otherwise.
-//
-// The narrowing conversions passed to the intrinsics are written out
-// explicitly. Only the bit pattern of each lane matters, and the resulting
-// values are the same as with the implicit conversions.
-INLINE void load_counters(uint64_t counter, bool increment_counter,
-                          __m256i *out_lo, __m256i *out_hi) {
-  // All-ones when incrementing, all-zeros otherwise.
-  const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
-  const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  const __m256i add1 = _mm256_and_si256(mask, add0);
-  // Low 32 bits of the counter plus the per-lane offset (wraps on overflow).
-  const __m256i l =
-      _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
-  // The compare below is signed, so flip the sign bit of both operands to get
-  // an unsigned comparison. add1 > l (unsigned) means the low-word addition
-  // wrapped. INT32_MIN has the bit pattern 0x80000000.
-  const __m256i sign_bit = _mm256_set1_epi32(INT32_MIN);
-  const __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, sign_bit),
-                                           _mm256_xor_si256(l, sign_bit));
-  // carry lanes are -1 where the low word wrapped, so subtracting adds 1 to
-  // the high word there.
-  const __m256i h =
-      _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
-  *out_lo = l;
-  *out_hi = h;
-}
-
-// Hash eight inputs of `blocks` 64-byte blocks each, all in parallel, and
-// write the eight resulting 32-byte chaining values to `out` one after
-// another. `flags` applies to every block, `flags_start` is OR-ed in for the
-// first block and `flags_end` for the last. Per-lane counters come from
-// load_counters().
-void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
-                       const uint32_t key[8], uint64_t counter,
-                       bool increment_counter, uint8_t flags,
-                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
-  // Every lane starts from the same key words.
-  __m256i cv[8];
-  for (size_t w = 0; w < 8; w++) {
-    cv[w] = set1(key[w]);
-  }
-
-  __m256i ctr_lo, ctr_hi;
-  load_counters(counter, increment_counter, &ctr_lo, &ctr_hi);
-
-  uint8_t cur_flags = flags | flags_start;
-  for (size_t b = 0; b < blocks; b++) {
-    const bool is_last_block = (b + 1 == blocks);
-    if (is_last_block) {
-      cur_flags |= flags_end;
-    }
-
-    __m256i msg[16];
-    transpose_msg_vecs(inputs, b * BLAKE3_BLOCK_LEN, msg);
-
-    __m256i v[16] = {
-        cv[0],       cv[1],       cv[2],                  cv[3],
-        cv[4],       cv[5],       cv[6],                  cv[7],
-        set1(IV[0]), set1(IV[1]), set1(IV[2]),            set1(IV[3]),
-        ctr_lo,      ctr_hi,      set1(BLAKE3_BLOCK_LEN), set1(cur_flags),
-    };
-
-    // Seven rounds, written out so every call sees a constant round index.
-    round_fn(v, msg, 0);
-    round_fn(v, msg, 1);
-    round_fn(v, msg, 2);
-    round_fn(v, msg, 3);
-    round_fn(v, msg, 4);
-    round_fn(v, msg, 5);
-    round_fn(v, msg, 6);
-
-    // The next chaining value is the XOR of the two state halves.
-    for (size_t w = 0; w < 8; w++) {
-      cv[w] = xorv(v[w], v[w + 8]);
-    }
-
-    // Only the first block carries flags_start.
-    cur_flags = flags;
-  }
-
-  // Go from word-major back to input-major layout and emit the results.
-  transpose_vecs(cv);
-  for (size_t i = 0; i < 8; i++) {
-    storeu(cv[i], &out[i * sizeof(__m256i)]);
-  }
-}
-
-// Forward declaration of the narrower implementation that
-// blake3_hash_many_avx2 hands its leftover inputs to: the SSE4.1 version
-// unless BLAKE3_NO_SSE41 is defined, otherwise the portable version. Both
-// take the same parameters as blake3_hash_many_avx2.
-#if !defined(BLAKE3_NO_SSE41)
-void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-#else
-void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-#endif
-
-// Hash `num_inputs` inputs of `blocks` blocks each. Full groups of DEGREE
-// inputs go through blake3_hash8_avx2. Whatever is left (possibly nothing)
-// is passed to the SSE4.1 implementation, or to the portable one when
-// BLAKE3_NO_SSE41 is defined. Output chaining values are written to `out`
-// back to back, BLAKE3_OUT_LEN bytes per input.
-void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
-                           size_t blocks, const uint32_t key[8],
-                           uint64_t counter, bool increment_counter,
-                           uint8_t flags, uint8_t flags_start,
-                           uint8_t flags_end, uint8_t *out) {
-  for (; num_inputs >= DEGREE; num_inputs -= DEGREE) {
-    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
-                      flags_start, flags_end, out);
-    // Each input in the group consumed one counter value.
-    if (increment_counter) {
-      counter += DEGREE;
-    }
-    inputs += DEGREE;
-    out += DEGREE * BLAKE3_OUT_LEN;
-  }
-#if !defined(BLAKE3_NO_SSE41)
-  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
-                         increment_counter, flags, flags_start, flags_end, out);
-#else
-  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
-                            increment_counter, flags, flags_start, flags_end,
-                            out);
-#endif
-}
diff --git a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S
deleted file mode 100644
index 812bb8568..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S
+++ /dev/null
@@ -1,1815 +0,0 @@
-#if defined(__ELF__) && defined(__linux__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
-#if __has_include(<cet.h>)
-#include <cet.h>
-#endif
-#endif
-
-#if !defined(_CET_ENDBR)
-#define _CET_ENDBR
-#endif
-
-.intel_syntax noprefix
-.global _blake3_hash_many_avx2
-.global blake3_hash_many_avx2
-#ifdef __APPLE__
-.text
-#else
-.section .text
-#endif
- .p2align 6
-_blake3_hash_many_avx2:
-blake3_hash_many_avx2:
- _CET_ENDBR
- push r15
- push r14
- push r13
- push r12
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 680
- and rsp, 0xFFFFFFFFFFFFFFC0
- neg r9d
- vmovd xmm0, r9d
- vpbroadcastd ymm0, xmm0
- vmovdqa ymmword ptr [rsp+0x280], ymm0
- vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
- vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
- vmovdqa ymmword ptr [rsp+0x220], ymm2
- vmovd xmm2, r8d
- vpbroadcastd ymm2, xmm2
- vpaddd ymm2, ymm2, ymm1
- vmovdqa ymmword ptr [rsp+0x240], ymm2
- vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
- vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
- vpcmpgtd ymm2, ymm1, ymm2
- shr r8, 32
- vmovd xmm3, r8d
- vpbroadcastd ymm3, xmm3
- vpsubd ymm3, ymm3, ymm2
- vmovdqa ymmword ptr [rsp+0x260], ymm3
- shl rdx, 6
- mov qword ptr [rsp+0x2A0], rdx
- cmp rsi, 8
- jc 3f
-2:
- vpbroadcastd ymm0, dword ptr [rcx]
- vpbroadcastd ymm1, dword ptr [rcx+0x4]
- vpbroadcastd ymm2, dword ptr [rcx+0x8]
- vpbroadcastd ymm3, dword ptr [rcx+0xC]
- vpbroadcastd ymm4, dword ptr [rcx+0x10]
- vpbroadcastd ymm5, dword ptr [rcx+0x14]
- vpbroadcastd ymm6, dword ptr [rcx+0x18]
- vpbroadcastd ymm7, dword ptr [rcx+0x1C]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x20]
- mov r13, qword ptr [rdi+0x28]
- mov r14, qword ptr [rdi+0x30]
- mov r15, qword ptr [rdi+0x38]
- movzx eax, byte ptr [rbp+0x38]
- movzx ebx, byte ptr [rbp+0x40]
- or eax, ebx
- xor edx, edx
-.p2align 5
-9:
- movzx ebx, byte ptr [rbp+0x48]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+0x2A0]
- cmove eax, ebx
- mov dword ptr [rsp+0x200], eax
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x40]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x40]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x40]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0x20], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0x40], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0x60], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-0x30]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x30]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x30]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+0x80], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0xA0], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0xC0], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0xE0], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x20]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x20]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x20]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+0x100], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0x120], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0x140], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0x160], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-0x10]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x10]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x10]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+0x180], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0x1A0], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0x1C0], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0x1E0], ymm11
- vpbroadcastd ymm15, dword ptr [rsp+0x200]
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r12+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r13+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r14+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- prefetcht0 [r15+rdx+0x80]
- vpaddd ymm0, ymm0, ymmword ptr [rsp]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm0, ymmword ptr [rsp+0x240]
- vpxor ymm13, ymm1, ymmword ptr [rsp+0x260]
- vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
- vpxor ymm15, ymm3, ymm15
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
- vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
- vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
- vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vpxor ymm0, ymm0, ymm8
- vpxor ymm1, ymm1, ymm9
- vpxor ymm2, ymm2, ymm10
- vpxor ymm3, ymm3, ymm11
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpxor ymm4, ymm4, ymm12
- vpxor ymm5, ymm5, ymm13
- vpxor ymm6, ymm6, ymm14
- vpxor ymm7, ymm7, ymm15
- movzx eax, byte ptr [rbp+0x38]
- jne 9b
- mov rbx, qword ptr [rbp+0x50]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
- vshufps ymm12, ymm8, ymm9, 78
- vblendps ymm1, ymm8, ymm12, 0xCC
- vshufps ymm8, ymm11, ymm0, 78
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0xCC
- vblendps ymm3, ymm12, ymm9, 0xCC
- vperm2f128 ymm12, ymm1, ymm2, 0x20
- vmovups ymmword ptr [rbx], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0xCC
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 0x20
- vmovups ymmword ptr [rbx+0x20], ymm7
- vshufps ymm5, ymm10, ymm13, 78
- vblendps ymm6, ymm5, ymm13, 0xCC
- vshufps ymm13, ymm14, ymm15, 78
- vblendps ymm10, ymm10, ymm5, 0xCC
- vblendps ymm14, ymm14, ymm13, 0xCC
- vperm2f128 ymm8, ymm10, ymm14, 0x20
- vmovups ymmword ptr [rbx+0x40], ymm8
- vblendps ymm15, ymm13, ymm15, 0xCC
- vperm2f128 ymm13, ymm6, ymm15, 0x20
- vmovups ymmword ptr [rbx+0x60], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 0x31
- vperm2f128 ymm11, ymm3, ymm4, 0x31
- vmovups ymmword ptr [rbx+0x80], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 0x31
- vperm2f128 ymm15, ymm6, ymm15, 0x31
- vmovups ymmword ptr [rbx+0xA0], ymm11
- vmovups ymmword ptr [rbx+0xC0], ymm14
- vmovups ymmword ptr [rbx+0xE0], ymm15
- vmovdqa ymm0, ymmword ptr [rsp+0x220]
- vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240]
- vmovdqa ymmword ptr [rsp+0x240], ymm1
- vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
- vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
- vpcmpgtd ymm2, ymm0, ymm2
- vmovdqa ymm0, ymmword ptr [rsp+0x260]
- vpsubd ymm2, ymm0, ymm2
- vmovdqa ymmword ptr [rsp+0x260], ymm2
- add rdi, 64
- add rbx, 256
- mov qword ptr [rbp+0x50], rbx
- sub rsi, 8
- cmp rsi, 8
- jnc 2b
- test rsi, rsi
- jnz 3f
-4:
- vzeroupper
- mov rsp, rbp
- pop rbp
- pop rbx
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 5
-3:
- mov rbx, qword ptr [rbp+0x50]
- mov r15, qword ptr [rsp+0x2A0]
- movzx r13d, byte ptr [rbp+0x38]
- movzx r12d, byte ptr [rbp+0x48]
- test rsi, 0x4
- je 3f
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
- vmovdqa ymm8, ymm0
- vmovdqa ymm9, ymm1
- vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
- vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
- vpunpckldq ymm14, ymm12, ymm13
- vpunpckhdq ymm15, ymm12, ymm13
- vpermq ymm14, ymm14, 0x50
- vpermq ymm15, ymm15, 0x50
- vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
- vpblendd ymm14, ymm14, ymm12, 0x44
- vpblendd ymm15, ymm15, ymm12, 0x44
- vmovdqa ymmword ptr [rsp], ymm14
- vmovdqa ymmword ptr [rsp+0x20], ymm15
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x200], eax
- vmovups ymm2, ymmword ptr [r8+rdx-0x40]
- vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
- vmovups ymm3, ymmword ptr [r8+rdx-0x30]
- vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
- vshufps ymm4, ymm2, ymm3, 136
- vshufps ymm5, ymm2, ymm3, 221
- vmovups ymm2, ymmword ptr [r8+rdx-0x20]
- vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
- vmovups ymm3, ymmword ptr [r8+rdx-0x10]
- vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
- vshufps ymm6, ymm2, ymm3, 136
- vshufps ymm7, ymm2, ymm3, 221
- vpshufd ymm6, ymm6, 0x93
- vpshufd ymm7, ymm7, 0x93
- vmovups ymm10, ymmword ptr [r10+rdx-0x40]
- vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
- vmovups ymm11, ymmword ptr [r10+rdx-0x30]
- vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
- vshufps ymm12, ymm10, ymm11, 136
- vshufps ymm13, ymm10, ymm11, 221
- vmovups ymm10, ymmword ptr [r10+rdx-0x20]
- vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
- vmovups ymm11, ymmword ptr [r10+rdx-0x10]
- vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
- vshufps ymm14, ymm10, ymm11, 136
- vshufps ymm15, ymm10, ymm11, 221
- vpshufd ymm14, ymm14, 0x93
- vpshufd ymm15, ymm15, 0x93
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- vpbroadcastd ymm2, dword ptr [rsp+0x200]
- vmovdqa ymm3, ymmword ptr [rsp]
- vmovdqa ymm11, ymmword ptr [rsp+0x20]
- vpblendd ymm3, ymm3, ymm2, 0x88
- vpblendd ymm11, ymm11, ymm2, 0x88
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
- vmovdqa ymm10, ymm2
- mov al, 7
-9:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm8, ymm8, ymm12
- vmovdqa ymmword ptr [rsp+0x40], ymm4
- nop
- vmovdqa ymmword ptr [rsp+0x60], ymm12
- nop
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 12
- vpslld ymm9, ymm9, 20
- vpor ymm9, ymm9, ymm4
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vmovdqa ymmword ptr [rsp+0x80], ymm5
- vmovdqa ymmword ptr [rsp+0xA0], ymm13
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 7
- vpslld ymm9, ymm9, 25
- vpor ymm9, ymm9, ymm4
- vpshufd ymm0, ymm0, 0x93
- vpshufd ymm8, ymm8, 0x93
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm11, ymm11, 0x4E
- vpshufd ymm2, ymm2, 0x39
- vpshufd ymm10, ymm10, 0x39
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm8, ymm8, ymm14
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 12
- vpslld ymm9, ymm9, 20
- vpor ymm9, ymm9, ymm4
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm8, ymm8, ymm15
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 7
- vpslld ymm9, ymm9, 25
- vpor ymm9, ymm9, ymm4
- vpshufd ymm0, ymm0, 0x39
- vpshufd ymm8, ymm8, 0x39
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm11, ymm11, 0x4E
- vpshufd ymm2, ymm2, 0x93
- vpshufd ymm10, ymm10, 0x93
- dec al
- je 9f
- vmovdqa ymm4, ymmword ptr [rsp+0x40]
- vmovdqa ymm5, ymmword ptr [rsp+0x80]
- vshufps ymm12, ymm4, ymm5, 214
- vpshufd ymm13, ymm4, 0x0F
- vpshufd ymm4, ymm12, 0x39
- vshufps ymm12, ymm6, ymm7, 250
- vpblendd ymm13, ymm13, ymm12, 0xAA
- vpunpcklqdq ymm12, ymm7, ymm5
- vpblendd ymm12, ymm12, ymm6, 0x88
- vpshufd ymm12, ymm12, 0x78
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 0x1E
- vmovdqa ymmword ptr [rsp+0x40], ymm13
- vmovdqa ymmword ptr [rsp+0x80], ymm12
- vmovdqa ymm12, ymmword ptr [rsp+0x60]
- vmovdqa ymm13, ymmword ptr [rsp+0xA0]
- vshufps ymm5, ymm12, ymm13, 214
- vpshufd ymm6, ymm12, 0x0F
- vpshufd ymm12, ymm5, 0x39
- vshufps ymm5, ymm14, ymm15, 250
- vpblendd ymm6, ymm6, ymm5, 0xAA
- vpunpcklqdq ymm5, ymm15, ymm13
- vpblendd ymm5, ymm5, ymm14, 0x88
- vpshufd ymm5, ymm5, 0x78
- vpunpckhdq ymm13, ymm13, ymm15
- vpunpckldq ymm14, ymm14, ymm13
- vpshufd ymm15, ymm14, 0x1E
- vmovdqa ymm13, ymm6
- vmovdqa ymm14, ymm5
- vmovdqa ymm5, ymmword ptr [rsp+0x40]
- vmovdqa ymm6, ymmword ptr [rsp+0x80]
- jmp 9b
-9:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- vpxor ymm8, ymm8, ymm10
- vpxor ymm9, ymm9, ymm11
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vmovdqu xmmword ptr [rbx+0x40], xmm8
- vmovdqu xmmword ptr [rbx+0x50], xmm9
- vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
- vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
- vmovaps xmm8, xmmword ptr [rsp+0x280]
- vmovaps xmm0, xmmword ptr [rsp+0x240]
- vmovaps xmm1, xmmword ptr [rsp+0x250]
- vmovaps xmm2, xmmword ptr [rsp+0x260]
- vmovaps xmm3, xmmword ptr [rsp+0x270]
- vblendvps xmm0, xmm0, xmm1, xmm8
- vblendvps xmm2, xmm2, xmm3, xmm8
- vmovaps xmmword ptr [rsp+0x240], xmm0
- vmovaps xmmword ptr [rsp+0x260], xmm2
- add rbx, 128
- add rdi, 32
- sub rsi, 4
-3:
- test rsi, 0x2
- je 3f
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
- vmovd xmm13, dword ptr [rsp+0x240]
- vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
- vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovd xmm14, dword ptr [rsp+0x244]
- vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vinserti128 ymm13, ymm13, xmm14, 0x01
- vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
- vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x200], eax
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
- vpbroadcastd ymm8, dword ptr [rsp+0x200]
- vpblendd ymm3, ymm13, ymm8, 0x88
- vmovups ymm8, ymmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
- vshufps ymm4, ymm8, ymm9, 136
- vshufps ymm5, ymm8, ymm9, 221
- vmovups ymm8, ymmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
- vshufps ymm6, ymm8, ymm9, 136
- vshufps ymm7, ymm8, ymm9, 221
- vpshufd ymm6, ymm6, 0x93
- vpshufd ymm7, ymm7, 0x93
- mov al, 7
-9:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm14
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm8
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm15
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm8
- vpshufd ymm0, ymm0, 0x93
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x39
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm14
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm8
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm15
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm8
- vpshufd ymm0, ymm0, 0x39
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x93
- dec al
- jz 9f
- vshufps ymm8, ymm4, ymm5, 214
- vpshufd ymm9, ymm4, 0x0F
- vpshufd ymm4, ymm8, 0x39
- vshufps ymm8, ymm6, ymm7, 250
- vpblendd ymm9, ymm9, ymm8, 0xAA
- vpunpcklqdq ymm8, ymm7, ymm5
- vpblendd ymm8, ymm8, ymm6, 0x88
- vpshufd ymm8, ymm8, 0x78
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 0x1E
- vmovdqa ymm5, ymm9
- vmovdqa ymm6, ymm8
- jmp 9b
-9:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vmovaps ymm8, ymmword ptr [rsp+0x280]
- vmovaps ymm0, ymmword ptr [rsp+0x240]
- vmovups ymm1, ymmword ptr [rsp+0x248]
- vmovaps ymm2, ymmword ptr [rsp+0x260]
- vmovups ymm3, ymmword ptr [rsp+0x268]
- vblendvps ymm0, ymm0, ymm1, ymm8
- vblendvps ymm2, ymm2, ymm3, ymm8
- vmovaps ymmword ptr [rsp+0x240], ymm0
- vmovaps ymmword ptr [rsp+0x260], ymm2
- add rbx, 64
- add rdi, 16
- sub rsi, 2
-3:
- test rsi, 0x1
- je 4b
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+0x10]
- vmovd xmm3, dword ptr [rsp+0x240]
- vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
- vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovdqa xmm14, xmmword ptr [ROT16+rip]
- vmovdqa xmm15, xmmword ptr [ROT8+rip]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
- vmovdqa xmm3, xmm13
- vpinsrd xmm3, xmm3, eax, 3
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vmovups xmm9, xmmword ptr [r8+rdx-0x30]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vmovups xmm9, xmmword ptr [r8+rdx-0x10]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 0x93
- vpshufd xmm7, xmm7, 0x93
- mov al, 7
-9:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm14
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 12
- vpslld xmm1, xmm1, 20
- vpor xmm1, xmm1, xmm8
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm15
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 7
- vpslld xmm1, xmm1, 25
- vpor xmm1, xmm1, xmm8
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm14
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 12
- vpslld xmm1, xmm1, 20
- vpor xmm1, xmm1, xmm8
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm15
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 7
- vpslld xmm1, xmm1, 25
- vpor xmm1, xmm1, xmm8
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0x0F
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp 9b
-9:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-
-
-#ifdef __APPLE__ /* constant tables for the AVX2 code above; only the section directive differs on Apple targets */
-.static_data
-#else
-.section .rodata
-#endif
-.p2align 6 /* start the tables on a 2^6 = 64-byte boundary */
-ADD0: /* lane indices 0..7, one per 32-bit lane of a ymm register */
- .long 0, 1, 2, 3, 4, 5, 6, 7
-ADD1: /* the value 8 in every 32-bit lane (NOTE(review): presumably the per-iteration step for the 8-wide loop; confirm against the full function) */
- .long 8, 8, 8, 8, 8, 8, 8, 8
-BLAKE3_IV_0: /* 0x6A09E667 replicated across all eight 32-bit lanes */
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
-BLAKE3_IV_1: /* 0xBB67AE85 replicated across all eight 32-bit lanes */
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
-BLAKE3_IV_2: /* 0x3C6EF372 replicated across all eight 32-bit lanes */
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
-BLAKE3_IV_3: /* 0xA54FF53A replicated across all eight 32-bit lanes */
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
-BLAKE3_BLOCK_LEN: /* 0x40 (64) in every 32-bit lane; read as a dword, as 128 bits, or as a full ymm by the code above */
- .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
- .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
-ROT16: /* vpshufb byte-index table: swaps the 16-bit halves of each 32-bit lane, i.e. rotates it by 16 bits */
- .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-ROT8: /* vpshufb byte-index table: output byte i takes input byte i+1 within each dword, i.e. rotates each 32-bit lane right by 8 bits */
- .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
-CMP_MSB_MASK: /* sign bit of every 32-bit lane; XORed into both operands before vpcmpgtd so the signed compare orders them as unsigned values */
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
-BLAKE3_IV: /* the same four words as BLAKE3_IV_0..3, packed into one 128-bit row (loaded with vbroadcasti128 / vmovdqa xmm above) */
- .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
-
diff --git a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S
deleted file mode 100644
index bb58d2ae6..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S
+++ /dev/null
@@ -1,1817 +0,0 @@
-.intel_syntax noprefix
-.global _blake3_hash_many_avx2
-.global blake3_hash_many_avx2
-.section .text
- .p2align 6
-_blake3_hash_many_avx2:
-blake3_hash_many_avx2:
- push r15
- push r14
- push r13
- push r12
- push rsi
- push rdi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 880
- and rsp, 0xFFFFFFFFFFFFFFC0
- vmovdqa xmmword ptr [rsp+0x2D0], xmm6
- vmovdqa xmmword ptr [rsp+0x2E0], xmm7
- vmovdqa xmmword ptr [rsp+0x2F0], xmm8
- vmovdqa xmmword ptr [rsp+0x300], xmm9
- vmovdqa xmmword ptr [rsp+0x310], xmm10
- vmovdqa xmmword ptr [rsp+0x320], xmm11
- vmovdqa xmmword ptr [rsp+0x330], xmm12
- vmovdqa xmmword ptr [rsp+0x340], xmm13
- vmovdqa xmmword ptr [rsp+0x350], xmm14
- vmovdqa xmmword ptr [rsp+0x360], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+0x68]
- movzx r9, byte ptr [rbp+0x70]
- neg r9d
- vmovd xmm0, r9d
- vpbroadcastd ymm0, xmm0
- vmovdqa ymmword ptr [rsp+0x260], ymm0
- vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
- vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
- vmovdqa ymmword ptr [rsp+0x2A0], ymm2
- vmovd xmm2, r8d
- vpbroadcastd ymm2, xmm2
- vpaddd ymm2, ymm2, ymm1
- vmovdqa ymmword ptr [rsp+0x220], ymm2
- vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
- vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
- vpcmpgtd ymm2, ymm1, ymm2
- shr r8, 32
- vmovd xmm3, r8d
- vpbroadcastd ymm3, xmm3
- vpsubd ymm3, ymm3, ymm2
- vmovdqa ymmword ptr [rsp+0x240], ymm3
- shl rdx, 6
- mov qword ptr [rsp+0x2C0], rdx
- cmp rsi, 8
- jc 3f
-2:
- vpbroadcastd ymm0, dword ptr [rcx]
- vpbroadcastd ymm1, dword ptr [rcx+0x4]
- vpbroadcastd ymm2, dword ptr [rcx+0x8]
- vpbroadcastd ymm3, dword ptr [rcx+0xC]
- vpbroadcastd ymm4, dword ptr [rcx+0x10]
- vpbroadcastd ymm5, dword ptr [rcx+0x14]
- vpbroadcastd ymm6, dword ptr [rcx+0x18]
- vpbroadcastd ymm7, dword ptr [rcx+0x1C]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x20]
- mov r13, qword ptr [rdi+0x28]
- mov r14, qword ptr [rdi+0x30]
- mov r15, qword ptr [rdi+0x38]
- movzx eax, byte ptr [rbp+0x78]
- movzx ebx, byte ptr [rbp+0x80]
- or eax, ebx
- xor edx, edx
-.p2align 5
-9:
- movzx ebx, byte ptr [rbp+0x88]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+0x2C0]
- cmove eax, ebx
- mov dword ptr [rsp+0x200], eax
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x40]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x40]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x40]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0x20], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0x40], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0x60], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-0x30]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x30]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x30]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+0x80], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0xA0], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0xC0], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0xE0], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x20]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x20]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x20]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+0x100], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0x120], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0x140], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0x160], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-0x10]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x10]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x10]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+0x180], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0x1A0], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0x1C0], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0x1E0], ymm11
- vpbroadcastd ymm15, dword ptr [rsp+0x200]
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r12+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r13+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r14+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- prefetcht0 [r15+rdx+0x80]
- vpaddd ymm0, ymm0, ymmword ptr [rsp]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm0, ymmword ptr [rsp+0x220]
- vpxor ymm13, ymm1, ymmword ptr [rsp+0x240]
- vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
- vpxor ymm15, ymm3, ymm15
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
- vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
- vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
- vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+0x200], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vpxor ymm0, ymm0, ymm8
- vpxor ymm1, ymm1, ymm9
- vpxor ymm2, ymm2, ymm10
- vpxor ymm3, ymm3, ymm11
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpxor ymm4, ymm4, ymm12
- vpxor ymm5, ymm5, ymm13
- vpxor ymm6, ymm6, ymm14
- vpxor ymm7, ymm7, ymm15
- movzx eax, byte ptr [rbp+0x78]
- jne 9b
- mov rbx, qword ptr [rbp+0x90]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
- vshufps ymm12, ymm8, ymm9, 78
- vblendps ymm1, ymm8, ymm12, 0xCC
- vshufps ymm8, ymm11, ymm0, 78
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0xCC
- vblendps ymm3, ymm12, ymm9, 0xCC
- vperm2f128 ymm12, ymm1, ymm2, 0x20
- vmovups ymmword ptr [rbx], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0xCC
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 0x20
- vmovups ymmword ptr [rbx+0x20], ymm7
- vshufps ymm5, ymm10, ymm13, 78
- vblendps ymm6, ymm5, ymm13, 0xCC
- vshufps ymm13, ymm14, ymm15, 78
- vblendps ymm10, ymm10, ymm5, 0xCC
- vblendps ymm14, ymm14, ymm13, 0xCC
- vperm2f128 ymm8, ymm10, ymm14, 0x20
- vmovups ymmword ptr [rbx+0x40], ymm8
- vblendps ymm15, ymm13, ymm15, 0xCC
- vperm2f128 ymm13, ymm6, ymm15, 0x20
- vmovups ymmword ptr [rbx+0x60], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 0x31
- vperm2f128 ymm11, ymm3, ymm4, 0x31
- vmovups ymmword ptr [rbx+0x80], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 0x31
- vperm2f128 ymm15, ymm6, ymm15, 0x31
- vmovups ymmword ptr [rbx+0xA0], ymm11
- vmovups ymmword ptr [rbx+0xC0], ymm14
- vmovups ymmword ptr [rbx+0xE0], ymm15
- vmovdqa ymm0, ymmword ptr [rsp+0x2A0]
- vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220]
- vmovdqa ymmword ptr [rsp+0x220], ymm1
- vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
- vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
- vpcmpgtd ymm2, ymm0, ymm2
- vmovdqa ymm0, ymmword ptr [rsp+0x240]
- vpsubd ymm2, ymm0, ymm2
- vmovdqa ymmword ptr [rsp+0x240], ymm2
- add rdi, 64
- add rbx, 256
- mov qword ptr [rbp+0x90], rbx
- sub rsi, 8
- cmp rsi, 8
- jnc 2b
- test rsi, rsi
- jnz 3f
-4:
- vzeroupper
- vmovdqa xmm6, xmmword ptr [rsp+0x2D0]
- vmovdqa xmm7, xmmword ptr [rsp+0x2E0]
- vmovdqa xmm8, xmmword ptr [rsp+0x2F0]
- vmovdqa xmm9, xmmword ptr [rsp+0x300]
- vmovdqa xmm10, xmmword ptr [rsp+0x310]
- vmovdqa xmm11, xmmword ptr [rsp+0x320]
- vmovdqa xmm12, xmmword ptr [rsp+0x330]
- vmovdqa xmm13, xmmword ptr [rsp+0x340]
- vmovdqa xmm14, xmmword ptr [rsp+0x350]
- vmovdqa xmm15, xmmword ptr [rsp+0x360]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 5
-3:
- mov rbx, qword ptr [rbp+0x90]
- mov r15, qword ptr [rsp+0x2C0]
- movzx r13d, byte ptr [rbp+0x78]
- movzx r12d, byte ptr [rbp+0x88]
- test rsi, 0x4
- je 3f
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
- vmovdqa ymm8, ymm0
- vmovdqa ymm9, ymm1
- vbroadcasti128 ymm12, xmmword ptr [rsp+0x220]
- vbroadcasti128 ymm13, xmmword ptr [rsp+0x240]
- vpunpckldq ymm14, ymm12, ymm13
- vpunpckhdq ymm15, ymm12, ymm13
- vpermq ymm14, ymm14, 0x50
- vpermq ymm15, ymm15, 0x50
- vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
- vpblendd ymm14, ymm14, ymm12, 0x44
- vpblendd ymm15, ymm15, ymm12, 0x44
- vmovdqa ymmword ptr [rsp], ymm14
- vmovdqa ymmword ptr [rsp+0x20], ymm15
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x200], eax
- vmovups ymm2, ymmword ptr [r8+rdx-0x40]
- vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
- vmovups ymm3, ymmword ptr [r8+rdx-0x30]
- vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
- vshufps ymm4, ymm2, ymm3, 136
- vshufps ymm5, ymm2, ymm3, 221
- vmovups ymm2, ymmword ptr [r8+rdx-0x20]
- vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
- vmovups ymm3, ymmword ptr [r8+rdx-0x10]
- vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
- vshufps ymm6, ymm2, ymm3, 136
- vshufps ymm7, ymm2, ymm3, 221
- vpshufd ymm6, ymm6, 0x93
- vpshufd ymm7, ymm7, 0x93
- vmovups ymm10, ymmword ptr [r10+rdx-0x40]
- vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
- vmovups ymm11, ymmword ptr [r10+rdx-0x30]
- vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
- vshufps ymm12, ymm10, ymm11, 136
- vshufps ymm13, ymm10, ymm11, 221
- vmovups ymm10, ymmword ptr [r10+rdx-0x20]
- vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
- vmovups ymm11, ymmword ptr [r10+rdx-0x10]
- vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
- vshufps ymm14, ymm10, ymm11, 136
- vshufps ymm15, ymm10, ymm11, 221
- vpshufd ymm14, ymm14, 0x93
- vpshufd ymm15, ymm15, 0x93
- vpbroadcastd ymm2, dword ptr [rsp+0x200]
- vmovdqa ymm3, ymmword ptr [rsp]
- vmovdqa ymm11, ymmword ptr [rsp+0x20]
- vpblendd ymm3, ymm3, ymm2, 0x88
- vpblendd ymm11, ymm11, ymm2, 0x88
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
- vmovdqa ymm10, ymm2
- mov al, 7
-9:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm8, ymm8, ymm12
- vmovdqa ymmword ptr [rsp+0x40], ymm4
- nop
- vmovdqa ymmword ptr [rsp+0x60], ymm12
- nop
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 12
- vpslld ymm9, ymm9, 20
- vpor ymm9, ymm9, ymm4
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vmovdqa ymmword ptr [rsp+0x80], ymm5
- vmovdqa ymmword ptr [rsp+0xA0], ymm13
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 7
- vpslld ymm9, ymm9, 25
- vpor ymm9, ymm9, ymm4
- vpshufd ymm0, ymm0, 0x93
- vpshufd ymm8, ymm8, 0x93
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm11, ymm11, 0x4E
- vpshufd ymm2, ymm2, 0x39
- vpshufd ymm10, ymm10, 0x39
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm8, ymm8, ymm14
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 12
- vpslld ymm9, ymm9, 20
- vpor ymm9, ymm9, ymm4
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm8, ymm8, ymm15
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 7
- vpslld ymm9, ymm9, 25
- vpor ymm9, ymm9, ymm4
- vpshufd ymm0, ymm0, 0x39
- vpshufd ymm8, ymm8, 0x39
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm11, ymm11, 0x4E
- vpshufd ymm2, ymm2, 0x93
- vpshufd ymm10, ymm10, 0x93
- dec al
- je 9f
- vmovdqa ymm4, ymmword ptr [rsp+0x40]
- vmovdqa ymm5, ymmword ptr [rsp+0x80]
- vshufps ymm12, ymm4, ymm5, 214
- vpshufd ymm13, ymm4, 0x0F
- vpshufd ymm4, ymm12, 0x39
- vshufps ymm12, ymm6, ymm7, 250
- vpblendd ymm13, ymm13, ymm12, 0xAA
- vpunpcklqdq ymm12, ymm7, ymm5
- vpblendd ymm12, ymm12, ymm6, 0x88
- vpshufd ymm12, ymm12, 0x78
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 0x1E
- vmovdqa ymmword ptr [rsp+0x40], ymm13
- vmovdqa ymmword ptr [rsp+0x80], ymm12
- vmovdqa ymm12, ymmword ptr [rsp+0x60]
- vmovdqa ymm13, ymmword ptr [rsp+0xA0]
- vshufps ymm5, ymm12, ymm13, 214
- vpshufd ymm6, ymm12, 0x0F
- vpshufd ymm12, ymm5, 0x39
- vshufps ymm5, ymm14, ymm15, 250
- vpblendd ymm6, ymm6, ymm5, 0xAA
- vpunpcklqdq ymm5, ymm15, ymm13
- vpblendd ymm5, ymm5, ymm14, 0x88
- vpshufd ymm5, ymm5, 0x78
- vpunpckhdq ymm13, ymm13, ymm15
- vpunpckldq ymm14, ymm14, ymm13
- vpshufd ymm15, ymm14, 0x1E
- vmovdqa ymm13, ymm6
- vmovdqa ymm14, ymm5
- vmovdqa ymm5, ymmword ptr [rsp+0x40]
- vmovdqa ymm6, ymmword ptr [rsp+0x80]
- jmp 9b
-9:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- vpxor ymm8, ymm8, ymm10
- vpxor ymm9, ymm9, ymm11
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vmovdqu xmmword ptr [rbx+0x40], xmm8
- vmovdqu xmmword ptr [rbx+0x50], xmm9
- vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
- vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
- vmovaps xmm8, xmmword ptr [rsp+0x260]
- vmovaps xmm0, xmmword ptr [rsp+0x220]
- vmovaps xmm1, xmmword ptr [rsp+0x230]
- vmovaps xmm2, xmmword ptr [rsp+0x240]
- vmovaps xmm3, xmmword ptr [rsp+0x250]
- vblendvps xmm0, xmm0, xmm1, xmm8
- vblendvps xmm2, xmm2, xmm3, xmm8
- vmovaps xmmword ptr [rsp+0x220], xmm0
- vmovaps xmmword ptr [rsp+0x240], xmm2
- add rbx, 128
- add rdi, 32
- sub rsi, 4
-3:
- test rsi, 0x2
- je 3f
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
- vmovd xmm13, dword ptr [rsp+0x220]
- vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1
- vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovd xmm14, dword ptr [rsp+0x224]
- vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vinserti128 ymm13, ymm13, xmm14, 0x01
- vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
- vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x200], eax
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
- vpbroadcastd ymm8, dword ptr [rsp+0x200]
- vpblendd ymm3, ymm13, ymm8, 0x88
- vmovups ymm8, ymmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
- vshufps ymm4, ymm8, ymm9, 136
- vshufps ymm5, ymm8, ymm9, 221
- vmovups ymm8, ymmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
- vshufps ymm6, ymm8, ymm9, 136
- vshufps ymm7, ymm8, ymm9, 221
- vpshufd ymm6, ymm6, 0x93
- vpshufd ymm7, ymm7, 0x93
- mov al, 7
-9:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm14
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm8
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm15
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm8
- vpshufd ymm0, ymm0, 0x93
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x39
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm14
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm8
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm15
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm8
- vpshufd ymm0, ymm0, 0x39
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x93
- dec al
- jz 9f
- vshufps ymm8, ymm4, ymm5, 214
- vpshufd ymm9, ymm4, 0x0F
- vpshufd ymm4, ymm8, 0x39
- vshufps ymm8, ymm6, ymm7, 250
- vpblendd ymm9, ymm9, ymm8, 0xAA
- vpunpcklqdq ymm8, ymm7, ymm5
- vpblendd ymm8, ymm8, ymm6, 0x88
- vpshufd ymm8, ymm8, 0x78
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 0x1E
- vmovdqa ymm5, ymm9
- vmovdqa ymm6, ymm8
- jmp 9b
-9:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vmovaps ymm8, ymmword ptr [rsp+0x260]
- vmovaps ymm0, ymmword ptr [rsp+0x220]
- vmovups ymm1, ymmword ptr [rsp+0x228]
- vmovaps ymm2, ymmword ptr [rsp+0x240]
- vmovups ymm3, ymmword ptr [rsp+0x248]
- vblendvps ymm0, ymm0, ymm1, ymm8
- vblendvps ymm2, ymm2, ymm3, ymm8
- vmovaps ymmword ptr [rsp+0x220], ymm0
- vmovaps ymmword ptr [rsp+0x240], ymm2
- add rbx, 64
- add rdi, 16
- sub rsi, 2
-3:
- test rsi, 0x1
- je 4b
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+0x10]
- vmovd xmm3, dword ptr [rsp+0x220]
- vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1
- vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovdqa xmm14, xmmword ptr [ROT16+rip]
- vmovdqa xmm15, xmmword ptr [ROT8+rip]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
- vmovdqa xmm3, xmm13
- vpinsrd xmm3, xmm3, eax, 3
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vmovups xmm9, xmmword ptr [r8+rdx-0x30]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vmovups xmm9, xmmword ptr [r8+rdx-0x10]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 0x93
- vpshufd xmm7, xmm7, 0x93
- mov al, 7
-9:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm14
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 12
- vpslld xmm1, xmm1, 20
- vpor xmm1, xmm1, xmm8
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm15
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 7
- vpslld xmm1, xmm1, 25
- vpor xmm1, xmm1, xmm8
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm14
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 12
- vpslld xmm1, xmm1, 20
- vpor xmm1, xmm1, xmm8
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm15
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 7
- vpslld xmm1, xmm1, 25
- vpor xmm1, xmm1, xmm8
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0x0F
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp 9b
-9:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-
-.section .rodata
-.p2align 6
-ADD0:
- .long 0, 1, 2, 3, 4, 5, 6, 7
-ADD1:
- .long 8, 8, 8, 8, 8, 8, 8, 8
-BLAKE3_IV_0:
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
-BLAKE3_IV_1:
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
-BLAKE3_IV_2:
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
-BLAKE3_IV_3:
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
-BLAKE3_BLOCK_LEN:
- .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
- .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
-ROT16:
- .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-ROT8:
- .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
-CMP_MSB_MASK:
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
-BLAKE3_IV:
- .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
-
diff --git a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm
deleted file mode 100644
index 352298edd..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm
+++ /dev/null
@@ -1,1828 +0,0 @@
-public _blake3_hash_many_avx2
-public blake3_hash_many_avx2
-
-_TEXT SEGMENT ALIGN(16) 'CODE'
-
-ALIGN 16
-blake3_hash_many_avx2 PROC
-_blake3_hash_many_avx2 PROC
- push r15
- push r14
- push r13
- push r12
- push rsi
- push rdi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 880
- and rsp, 0FFFFFFFFFFFFFFC0H
- vmovdqa xmmword ptr [rsp+2D0H], xmm6
- vmovdqa xmmword ptr [rsp+2E0H], xmm7
- vmovdqa xmmword ptr [rsp+2F0H], xmm8
- vmovdqa xmmword ptr [rsp+300H], xmm9
- vmovdqa xmmword ptr [rsp+310H], xmm10
- vmovdqa xmmword ptr [rsp+320H], xmm11
- vmovdqa xmmword ptr [rsp+330H], xmm12
- vmovdqa xmmword ptr [rsp+340H], xmm13
- vmovdqa xmmword ptr [rsp+350H], xmm14
- vmovdqa xmmword ptr [rsp+360H], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+68H]
- movzx r9, byte ptr [rbp+70H]
- neg r9d
- vmovd xmm0, r9d
- vpbroadcastd ymm0, xmm0
- vmovdqa ymmword ptr [rsp+260H], ymm0
- vpand ymm1, ymm0, ymmword ptr [ADD0]
- vpand ymm2, ymm0, ymmword ptr [ADD1]
- vmovdqa ymmword ptr [rsp+2A0H], ymm2
- vmovd xmm2, r8d
- vpbroadcastd ymm2, xmm2
- vpaddd ymm2, ymm2, ymm1
- vmovdqa ymmword ptr [rsp+220H], ymm2
- vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK]
- vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK]
- vpcmpgtd ymm2, ymm1, ymm2
- shr r8, 32
- vmovd xmm3, r8d
- vpbroadcastd ymm3, xmm3
- vpsubd ymm3, ymm3, ymm2
- vmovdqa ymmword ptr [rsp+240H], ymm3
- shl rdx, 6
- mov qword ptr [rsp+2C0H], rdx
- cmp rsi, 8
- jc final7blocks
-outerloop8:
- vpbroadcastd ymm0, dword ptr [rcx]
- vpbroadcastd ymm1, dword ptr [rcx+4H]
- vpbroadcastd ymm2, dword ptr [rcx+8H]
- vpbroadcastd ymm3, dword ptr [rcx+0CH]
- vpbroadcastd ymm4, dword ptr [rcx+10H]
- vpbroadcastd ymm5, dword ptr [rcx+14H]
- vpbroadcastd ymm6, dword ptr [rcx+18H]
- vpbroadcastd ymm7, dword ptr [rcx+1CH]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- mov r12, qword ptr [rdi+20H]
- mov r13, qword ptr [rdi+28H]
- mov r14, qword ptr [rdi+30H]
- mov r15, qword ptr [rdi+38H]
- movzx eax, byte ptr [rbp+78H]
- movzx ebx, byte ptr [rbp+80H]
- or eax, ebx
- xor edx, edx
-ALIGN 16
-innerloop8:
- movzx ebx, byte ptr [rbp+88H]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+2C0H]
- cmove eax, ebx
- mov dword ptr [rsp+200H], eax
- vmovups xmm8, xmmword ptr [r8+rdx-40H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-40H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-40H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-40H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+20H], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+40H], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+60H], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-30H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-30H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-30H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-30H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+80H], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+0A0H], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+0C0H], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+0E0H], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-20H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-20H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-20H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-20H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+100H], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+120H], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+140H], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+160H], ymm11
- vmovups xmm8, xmmword ptr [r8+rdx-10H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-10H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-10H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-10H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm8, ymm12, ymm14, 136
- vmovaps ymmword ptr [rsp+180H], ymm8
- vshufps ymm9, ymm12, ymm14, 221
- vmovaps ymmword ptr [rsp+1A0H], ymm9
- vshufps ymm10, ymm13, ymm15, 136
- vmovaps ymmword ptr [rsp+1C0H], ymm10
- vshufps ymm11, ymm13, ymm15, 221
- vmovaps ymmword ptr [rsp+1E0H], ymm11
- vpbroadcastd ymm15, dword ptr [rsp+200H]
- prefetcht0 byte ptr [r8+rdx+80H]
- prefetcht0 byte ptr [r12+rdx+80H]
- prefetcht0 byte ptr [r9+rdx+80H]
- prefetcht0 byte ptr [r13+rdx+80H]
- prefetcht0 byte ptr [r10+rdx+80H]
- prefetcht0 byte ptr [r14+rdx+80H]
- prefetcht0 byte ptr [r11+rdx+80H]
- prefetcht0 byte ptr [r15+rdx+80H]
- vpaddd ymm0, ymm0, ymmword ptr [rsp]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+40H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+80H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm0, ymmword ptr [rsp+220H]
- vpxor ymm13, ymm1, ymmword ptr [rsp+240H]
- vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN]
- vpxor ymm15, ymm3, ymm15
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0]
- vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1]
- vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2]
- vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3]
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+20H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+60H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+100H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+180H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+120H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+40H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+60H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+80H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+20H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+120H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+160H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+60H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+80H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+40H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+120H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+160H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+20H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+140H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+120H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+60H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+80H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+20H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+40H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+100H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+180H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+120H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+140H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+40H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+60H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+20H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+80H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+120H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+100H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+180H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+20H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+40H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+80H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+60H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+160H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+20H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+120H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxor ymm12, ymm12, ymm0
- vpxor ymm13, ymm13, ymm1
- vpxor ymm14, ymm14, ymm2
- vpxor ymm15, ymm15, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpshufb ymm15, ymm15, ymm8
- vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxor ymm4, ymm4, ymm8
- vpxor ymm5, ymm5, ymm9
- vpxor ymm6, ymm6, ymm10
- vpxor ymm7, ymm7, ymm11
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+40H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+60H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT16]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vmovdqa ymmword ptr [rsp+200H], ymm8
- vpsrld ymm8, ymm5, 12
- vpslld ymm5, ymm5, 20
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 12
- vpslld ymm6, ymm6, 20
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 12
- vpslld ymm7, ymm7, 20
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 12
- vpslld ymm4, ymm4, 20
- vpor ymm4, ymm4, ymm8
- vpaddd ymm0, ymm0, ymmword ptr [rsp+140H]
- vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
- vpaddd ymm2, ymm2, ymmword ptr [rsp+80H]
- vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H]
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxor ymm15, ymm15, ymm0
- vpxor ymm12, ymm12, ymm1
- vpxor ymm13, ymm13, ymm2
- vpxor ymm14, ymm14, ymm3
- vbroadcasti128 ymm8, xmmword ptr [ROT8]
- vpshufb ymm15, ymm15, ymm8
- vpshufb ymm12, ymm12, ymm8
- vpshufb ymm13, ymm13, ymm8
- vpshufb ymm14, ymm14, ymm8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
- vpaddd ymm9, ymm9, ymm14
- vpxor ymm5, ymm5, ymm10
- vpxor ymm6, ymm6, ymm11
- vpxor ymm7, ymm7, ymm8
- vpxor ymm4, ymm4, ymm9
- vpxor ymm0, ymm0, ymm8
- vpxor ymm1, ymm1, ymm9
- vpxor ymm2, ymm2, ymm10
- vpxor ymm3, ymm3, ymm11
- vpsrld ymm8, ymm5, 7
- vpslld ymm5, ymm5, 25
- vpor ymm5, ymm5, ymm8
- vpsrld ymm8, ymm6, 7
- vpslld ymm6, ymm6, 25
- vpor ymm6, ymm6, ymm8
- vpsrld ymm8, ymm7, 7
- vpslld ymm7, ymm7, 25
- vpor ymm7, ymm7, ymm8
- vpsrld ymm8, ymm4, 7
- vpslld ymm4, ymm4, 25
- vpor ymm4, ymm4, ymm8
- vpxor ymm4, ymm4, ymm12
- vpxor ymm5, ymm5, ymm13
- vpxor ymm6, ymm6, ymm14
- vpxor ymm7, ymm7, ymm15
- movzx eax, byte ptr [rbp+78H]
- jne innerloop8
- mov rbx, qword ptr [rbp+90H]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
- vshufps ymm12, ymm8, ymm9, 78
- vblendps ymm1, ymm8, ymm12, 0CCH
- vshufps ymm8, ymm11, ymm0, 78
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0CCH
- vblendps ymm3, ymm12, ymm9, 0CCH
- vperm2f128 ymm12, ymm1, ymm2, 20H
- vmovups ymmword ptr [rbx], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0CCH
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 20H
- vmovups ymmword ptr [rbx+20H], ymm7
- vshufps ymm5, ymm10, ymm13, 78
- vblendps ymm6, ymm5, ymm13, 0CCH
- vshufps ymm13, ymm14, ymm15, 78
- vblendps ymm10, ymm10, ymm5, 0CCH
- vblendps ymm14, ymm14, ymm13, 0CCH
- vperm2f128 ymm8, ymm10, ymm14, 20H
- vmovups ymmword ptr [rbx+40H], ymm8
- vblendps ymm15, ymm13, ymm15, 0CCH
- vperm2f128 ymm13, ymm6, ymm15, 20H
- vmovups ymmword ptr [rbx+60H], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 31H
- vperm2f128 ymm11, ymm3, ymm4, 31H
- vmovups ymmword ptr [rbx+80H], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 31H
- vperm2f128 ymm15, ymm6, ymm15, 31H
- vmovups ymmword ptr [rbx+0A0H], ymm11
- vmovups ymmword ptr [rbx+0C0H], ymm14
- vmovups ymmword ptr [rbx+0E0H], ymm15
- vmovdqa ymm0, ymmword ptr [rsp+2A0H]
- vpaddd ymm1, ymm0, ymmword ptr [rsp+220H]
- vmovdqa ymmword ptr [rsp+220H], ymm1
- vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK]
- vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK]
- vpcmpgtd ymm2, ymm0, ymm2
- vmovdqa ymm0, ymmword ptr [rsp+240H]
- vpsubd ymm2, ymm0, ymm2
- vmovdqa ymmword ptr [rsp+240H], ymm2
- add rdi, 64
- add rbx, 256
- mov qword ptr [rbp+90H], rbx
- sub rsi, 8
- cmp rsi, 8
- jnc outerloop8
- test rsi, rsi
- jnz final7blocks
-unwind:
- vzeroupper
- vmovdqa xmm6, xmmword ptr [rsp+2D0H]
- vmovdqa xmm7, xmmword ptr [rsp+2E0H]
- vmovdqa xmm8, xmmword ptr [rsp+2F0H]
- vmovdqa xmm9, xmmword ptr [rsp+300H]
- vmovdqa xmm10, xmmword ptr [rsp+310H]
- vmovdqa xmm11, xmmword ptr [rsp+320H]
- vmovdqa xmm12, xmmword ptr [rsp+330H]
- vmovdqa xmm13, xmmword ptr [rsp+340H]
- vmovdqa xmm14, xmmword ptr [rsp+350H]
- vmovdqa xmm15, xmmword ptr [rsp+360H]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-ALIGN 16
-final7blocks:
- mov rbx, qword ptr [rbp+90H]
- mov r15, qword ptr [rsp+2C0H]
- movzx r13d, byte ptr [rbp+78H]
- movzx r12d, byte ptr [rbp+88H]
- test rsi, 4H
- je final3blocks
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+10H]
- vmovdqa ymm8, ymm0
- vmovdqa ymm9, ymm1
- vbroadcasti128 ymm12, xmmword ptr [rsp+220H]
- vbroadcasti128 ymm13, xmmword ptr [rsp+240H]
- vpunpckldq ymm14, ymm12, ymm13
- vpunpckhdq ymm15, ymm12, ymm13
- vpermq ymm14, ymm14, 50H
- vpermq ymm15, ymm15, 50H
- vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN]
- vpblendd ymm14, ymm14, ymm12, 44H
- vpblendd ymm15, ymm15, ymm12, 44H
- vmovdqa ymmword ptr [rsp], ymm14
- vmovdqa ymmword ptr [rsp+20H], ymm15
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-ALIGN 16
-innerloop4:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+200H], eax
- vmovups ymm2, ymmword ptr [r8+rdx-40H]
- vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H
- vmovups ymm3, ymmword ptr [r8+rdx-30H]
- vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H
- vshufps ymm4, ymm2, ymm3, 136
- vshufps ymm5, ymm2, ymm3, 221
- vmovups ymm2, ymmword ptr [r8+rdx-20H]
- vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H
- vmovups ymm3, ymmword ptr [r8+rdx-10H]
- vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H
- vshufps ymm6, ymm2, ymm3, 136
- vshufps ymm7, ymm2, ymm3, 221
- vpshufd ymm6, ymm6, 93H
- vpshufd ymm7, ymm7, 93H
- vmovups ymm10, ymmword ptr [r10+rdx-40H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H
- vmovups ymm11, ymmword ptr [r10+rdx-30H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H
- vshufps ymm12, ymm10, ymm11, 136
- vshufps ymm13, ymm10, ymm11, 221
- vmovups ymm10, ymmword ptr [r10+rdx-20H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H
- vmovups ymm11, ymmword ptr [r10+rdx-10H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H
- vshufps ymm14, ymm10, ymm11, 136
- vshufps ymm15, ymm10, ymm11, 221
- vpshufd ymm14, ymm14, 93H
- vpshufd ymm15, ymm15, 93H
- vpbroadcastd ymm2, dword ptr [rsp+200H]
- vmovdqa ymm3, ymmword ptr [rsp]
- vmovdqa ymm11, ymmword ptr [rsp+20H]
- vpblendd ymm3, ymm3, ymm2, 88H
- vpblendd ymm11, ymm11, ymm2, 88H
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV]
- vmovdqa ymm10, ymm2
- mov al, 7
-roundloop4:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm8, ymm8, ymm12
- vmovdqa ymmword ptr [rsp+40H], ymm4
- nop
- vmovdqa ymmword ptr [rsp+60H], ymm12
- nop
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT16]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 12
- vpslld ymm9, ymm9, 20
- vpor ymm9, ymm9, ymm4
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vmovdqa ymmword ptr [rsp+80H], ymm5
- vmovdqa ymmword ptr [rsp+0A0H], ymm13
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT8]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 7
- vpslld ymm9, ymm9, 25
- vpor ymm9, ymm9, ymm4
- vpshufd ymm0, ymm0, 93H
- vpshufd ymm8, ymm8, 93H
- vpshufd ymm3, ymm3, 4EH
- vpshufd ymm11, ymm11, 4EH
- vpshufd ymm2, ymm2, 39H
- vpshufd ymm10, ymm10, 39H
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm8, ymm8, ymm14
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT16]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 12
- vpslld ymm9, ymm9, 20
- vpor ymm9, ymm9, ymm4
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm8, ymm8, ymm15
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm8, ymm8, ymm9
- vpxor ymm3, ymm3, ymm0
- vpxor ymm11, ymm11, ymm8
- vbroadcasti128 ymm4, xmmword ptr [ROT8]
- vpshufb ymm3, ymm3, ymm4
- vpshufb ymm11, ymm11, ymm4
- vpaddd ymm2, ymm2, ymm3
- vpaddd ymm10, ymm10, ymm11
- vpxor ymm1, ymm1, ymm2
- vpxor ymm9, ymm9, ymm10
- vpsrld ymm4, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm4
- vpsrld ymm4, ymm9, 7
- vpslld ymm9, ymm9, 25
- vpor ymm9, ymm9, ymm4
- vpshufd ymm0, ymm0, 39H
- vpshufd ymm8, ymm8, 39H
- vpshufd ymm3, ymm3, 4EH
- vpshufd ymm11, ymm11, 4EH
- vpshufd ymm2, ymm2, 93H
- vpshufd ymm10, ymm10, 93H
- dec al
- je endroundloop4
- vmovdqa ymm4, ymmword ptr [rsp+40H]
- vmovdqa ymm5, ymmword ptr [rsp+80H]
- vshufps ymm12, ymm4, ymm5, 214
- vpshufd ymm13, ymm4, 0FH
- vpshufd ymm4, ymm12, 39H
- vshufps ymm12, ymm6, ymm7, 250
- vpblendd ymm13, ymm13, ymm12, 0AAH
- vpunpcklqdq ymm12, ymm7, ymm5
- vpblendd ymm12, ymm12, ymm6, 88H
- vpshufd ymm12, ymm12, 78H
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 1EH
- vmovdqa ymmword ptr [rsp+40H], ymm13
- vmovdqa ymmword ptr [rsp+80H], ymm12
- vmovdqa ymm12, ymmword ptr [rsp+60H]
- vmovdqa ymm13, ymmword ptr [rsp+0A0H]
- vshufps ymm5, ymm12, ymm13, 214
- vpshufd ymm6, ymm12, 0FH
- vpshufd ymm12, ymm5, 39H
- vshufps ymm5, ymm14, ymm15, 250
- vpblendd ymm6, ymm6, ymm5, 0AAH
- vpunpcklqdq ymm5, ymm15, ymm13
- vpblendd ymm5, ymm5, ymm14, 88H
- vpshufd ymm5, ymm5, 78H
- vpunpckhdq ymm13, ymm13, ymm15
- vpunpckldq ymm14, ymm14, ymm13
- vpshufd ymm15, ymm14, 1EH
- vmovdqa ymm13, ymm6
- vmovdqa ymm14, ymm5
- vmovdqa ymm5, ymmword ptr [rsp+40H]
- vmovdqa ymm6, ymmword ptr [rsp+80H]
- jmp roundloop4
-endroundloop4:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- vpxor ymm8, ymm8, ymm10
- vpxor ymm9, ymm9, ymm11
- mov eax, r13d
- cmp rdx, r15
- jne innerloop4
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+10H], xmm1
- vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
- vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
- vmovdqu xmmword ptr [rbx+40H], xmm8
- vmovdqu xmmword ptr [rbx+50H], xmm9
- vextracti128 xmmword ptr [rbx+60H], ymm8, 01H
- vextracti128 xmmword ptr [rbx+70H], ymm9, 01H
- vmovaps xmm8, xmmword ptr [rsp+260H]
- vmovaps xmm0, xmmword ptr [rsp+220H]
- vmovaps xmm1, xmmword ptr [rsp+230H]
- vmovaps xmm2, xmmword ptr [rsp+240H]
- vmovaps xmm3, xmmword ptr [rsp+250H]
- vblendvps xmm0, xmm0, xmm1, xmm8
- vblendvps xmm2, xmm2, xmm3, xmm8
- vmovaps xmmword ptr [rsp+220H], xmm0
- vmovaps xmmword ptr [rsp+240H], xmm2
- add rbx, 128
- add rdi, 32
- sub rsi, 4
-final3blocks:
- test rsi, 2H
- je final1blocks
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+10H]
- vmovd xmm13, dword ptr [rsp+220H]
- vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1
- vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
- vmovd xmm14, dword ptr [rsp+224H]
- vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
- vinserti128 ymm13, ymm13, xmm14, 01H
- vbroadcasti128 ymm14, xmmword ptr [ROT16]
- vbroadcasti128 ymm15, xmmword ptr [ROT8]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-ALIGN 16
-innerloop2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+200H], eax
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV]
- vpbroadcastd ymm8, dword ptr [rsp+200H]
- vpblendd ymm3, ymm13, ymm8, 88H
- vmovups ymm8, ymmword ptr [r8+rdx-40H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H
- vmovups ymm9, ymmword ptr [r8+rdx-30H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H
- vshufps ymm4, ymm8, ymm9, 136
- vshufps ymm5, ymm8, ymm9, 221
- vmovups ymm8, ymmword ptr [r8+rdx-20H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H
- vmovups ymm9, ymmword ptr [r8+rdx-10H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H
- vshufps ymm6, ymm8, ymm9, 136
- vshufps ymm7, ymm8, ymm9, 221
- vpshufd ymm6, ymm6, 93H
- vpshufd ymm7, ymm7, 93H
- mov al, 7
-roundloop2:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm14
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm8
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm15
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm8
- vpshufd ymm0, ymm0, 93H
- vpshufd ymm3, ymm3, 4EH
- vpshufd ymm2, ymm2, 39H
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm14
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 12
- vpslld ymm1, ymm1, 20
- vpor ymm1, ymm1, ymm8
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm0, ymm0, ymm1
- vpxor ymm3, ymm3, ymm0
- vpshufb ymm3, ymm3, ymm15
- vpaddd ymm2, ymm2, ymm3
- vpxor ymm1, ymm1, ymm2
- vpsrld ymm8, ymm1, 7
- vpslld ymm1, ymm1, 25
- vpor ymm1, ymm1, ymm8
- vpshufd ymm0, ymm0, 39H
- vpshufd ymm3, ymm3, 4EH
- vpshufd ymm2, ymm2, 93H
- dec al
- jz endroundloop2
- vshufps ymm8, ymm4, ymm5, 214
- vpshufd ymm9, ymm4, 0FH
- vpshufd ymm4, ymm8, 39H
- vshufps ymm8, ymm6, ymm7, 250
- vpblendd ymm9, ymm9, ymm8, 0AAH
- vpunpcklqdq ymm8, ymm7, ymm5
- vpblendd ymm8, ymm8, ymm6, 88H
- vpshufd ymm8, ymm8, 78H
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 1EH
- vmovdqa ymm5, ymm9
- vmovdqa ymm6, ymm8
- jmp roundloop2
-endroundloop2:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- mov eax, r13d
- cmp rdx, r15
- jne innerloop2
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+10H], xmm1
- vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
- vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
- vmovaps ymm8, ymmword ptr [rsp+260H]
- vmovaps ymm0, ymmword ptr [rsp+220H]
- vmovups ymm1, ymmword ptr [rsp+228H]
- vmovaps ymm2, ymmword ptr [rsp+240H]
- vmovups ymm3, ymmword ptr [rsp+248H]
- vblendvps ymm0, ymm0, ymm1, ymm8
- vblendvps ymm2, ymm2, ymm3, ymm8
- vmovaps ymmword ptr [rsp+220H], ymm0
- vmovaps ymmword ptr [rsp+240H], ymm2
- add rbx, 64
- add rdi, 16
- sub rsi, 2
-final1blocks:
- test rsi, 1H
- je unwind
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+10H]
- vmovd xmm3, dword ptr [rsp+220H]
- vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1
- vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2
- vmovdqa xmm14, xmmword ptr [ROT16]
- vmovdqa xmm15, xmmword ptr [ROT8]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-ALIGN 16
-innerloop1:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- vmovdqa xmm2, xmmword ptr [BLAKE3_IV]
- vmovdqa xmm3, xmm13
- vpinsrd xmm3, xmm3, eax, 3
- vmovups xmm8, xmmword ptr [r8+rdx-40H]
- vmovups xmm9, xmmword ptr [r8+rdx-30H]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [r8+rdx-20H]
- vmovups xmm9, xmmword ptr [r8+rdx-10H]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 93H
- vpshufd xmm7, xmm7, 93H
- mov al, 7
-roundloop1:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm14
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 12
- vpslld xmm1, xmm1, 20
- vpor xmm1, xmm1, xmm8
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm15
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 7
- vpslld xmm1, xmm1, 25
- vpor xmm1, xmm1, xmm8
- vpshufd xmm0, xmm0, 93H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 39H
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm14
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 12
- vpslld xmm1, xmm1, 20
- vpor xmm1, xmm1, xmm8
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxor xmm3, xmm3, xmm0
- vpshufb xmm3, xmm3, xmm15
- vpaddd xmm2, xmm2, xmm3
- vpxor xmm1, xmm1, xmm2
- vpsrld xmm8, xmm1, 7
- vpslld xmm1, xmm1, 25
- vpor xmm1, xmm1, xmm8
- vpshufd xmm0, xmm0, 39H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 93H
- dec al
- jz endroundloop1
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0FH
- vpshufd xmm4, xmm8, 39H
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0AAH
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 88H
- vpshufd xmm8, xmm8, 78H
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 1EH
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp roundloop1
-endroundloop1:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne innerloop1
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+10H], xmm1
- jmp unwind
-
-_blake3_hash_many_avx2 ENDP
-blake3_hash_many_avx2 ENDP
-_TEXT ENDS
-
-_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
-ALIGN 64
-ADD0: ; eight dwords 0..7 -- NOTE(review): not referenced in the visible part of the proc; presumably per-lane counter offsets, confirm
- dd 0, 1, 2, 3, 4, 5, 6, 7
-
-ADD1: ; eight dwords, each 8 -- NOTE(review): presumably the per-iteration counter step for 8 lanes, confirm
- dd 8 dup (8)
-
-BLAKE3_IV_0: ; first IV word (same value as BLAKE3_IV[0]) replicated into 8 dword lanes
- dd 8 dup (6A09E667H)
-
-BLAKE3_IV_1: ; second IV word replicated into 8 dword lanes
- dd 8 dup (0BB67AE85H)
-
-BLAKE3_IV_2: ; third IV word replicated into 8 dword lanes
- dd 8 dup (3C6EF372H)
-
-BLAKE3_IV_3: ; fourth IV word replicated into 8 dword lanes
- dd 8 dup (0A54FF53AH)
-
-BLAKE3_BLOCK_LEN: ; the value 64 in 8 dword lanes; the tail paths insert it into the counter/length/flags row
- dd 8 dup (64)
-
-ROT16: ; vpshufb control: swaps the 16-bit halves of every dword, i.e. a rotate by 16 bits
- db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-
-ROT8: ; vpshufb control: moves byte i+1 to byte i within every dword, i.e. a rotate right by 8 bits
- db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
-
-CMP_MSB_MASK: ; sign-bit mask XORed into both operands before vpcmpgtd so the signed compare orders them as unsigned
- dd 8 dup(80000000H)
-
-BLAKE3_IV: ; the four IV words in order; broadcast as row 2 of the state in the 4/2/1-input tail paths
- dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
-
-_RDATA ENDS
-END
diff --git a/thirdparty/BLAKE3/c/blake3_avx512.c b/thirdparty/BLAKE3/c/blake3_avx512.c
deleted file mode 100644
index 77a5c385c..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx512.c
+++ /dev/null
@@ -1,1204 +0,0 @@
-#include "blake3_impl.h"
-
-#include <immintrin.h>
-
-// Integer-typed wrapper around _mm_shuffle_ps: both __m128i operands are
-// reinterpreted as packed floats, shuffled with the immediate selector, and
-// the result is reinterpreted back as __m128i. No value conversion happens.
-#define _mm_shuffle_ps2(lo, hi, imm)                                         \
-  (_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(lo),                      \
-                                   _mm_castsi128_ps(hi), (imm))))
-
-// Unaligned load of the 16 bytes at src into a 128-bit vector.
-INLINE __m128i loadu_128(const uint8_t src[16]) {
-  const __m128i *from = (const __m128i *)src;
-  return _mm_loadu_si128(from);
-}
-
-// Unaligned load of the 32 bytes at src into a 256-bit vector.
-INLINE __m256i loadu_256(const uint8_t src[32]) {
-  const __m256i *from = (const __m256i *)src;
-  return _mm256_loadu_si256(from);
-}
-
-// Unaligned load of the 64 bytes at src into a 512-bit vector.
-INLINE __m512i loadu_512(const uint8_t src[64]) {
-  const __m512i *from = (const __m512i *)src;
-  return _mm512_loadu_si512(from);
-}
-
-// Unaligned store of a 128-bit vector to the 16 bytes at dest.
-INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
-  __m128i *to = (__m128i *)dest;
-  _mm_storeu_si128(to, src);
-}
-
-// Unaligned store of a 256-bit vector to the 32 bytes at dest.
-// The declared bound is 32 (not 16): the store writes a full __m256i, and the
-// bound now matches loadu_256. The parameter still decays to uint8_t *, so
-// existing callers are unaffected.
-INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
-  _mm256_storeu_si256((__m256i *)dest, src);
-}
-
-// Lane-wise 32-bit addition (wrapping) for 128/256/512-bit vectors.
-INLINE __m128i add_128(__m128i a, __m128i b) {
-  return _mm_add_epi32(a, b);
-}
-
-INLINE __m256i add_256(__m256i a, __m256i b) {
-  return _mm256_add_epi32(a, b);
-}
-
-INLINE __m512i add_512(__m512i a, __m512i b) {
-  return _mm512_add_epi32(a, b);
-}
-
-// Bitwise XOR for 128/256/512-bit vectors.
-INLINE __m128i xor_128(__m128i a, __m128i b) {
-  return _mm_xor_si128(a, b);
-}
-
-INLINE __m256i xor_256(__m256i a, __m256i b) {
-  return _mm256_xor_si256(a, b);
-}
-
-INLINE __m512i xor_512(__m512i a, __m512i b) {
-  return _mm512_xor_si512(a, b);
-}
-
-// Broadcast one 32-bit word into every lane of a 128/256/512-bit vector.
-INLINE __m128i set1_128(uint32_t x) {
-  const int32_t word = (int32_t)x;
-  return _mm_set1_epi32(word);
-}
-
-INLINE __m256i set1_256(uint32_t x) {
-  const int32_t word = (int32_t)x;
-  return _mm256_set1_epi32(word);
-}
-
-INLINE __m512i set1_512(uint32_t x) {
-  const int32_t word = (int32_t)x;
-  return _mm512_set1_epi32(word);
-}
-
-// Build a 128-bit vector whose lanes 0..3 are a, b, c, d in that order.
-INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
-  const int32_t lane0 = (int32_t)a;
-  const int32_t lane1 = (int32_t)b;
-  const int32_t lane2 = (int32_t)c;
-  const int32_t lane3 = (int32_t)d;
-  return _mm_setr_epi32(lane0, lane1, lane2, lane3);
-}
-
-// Per-lane 32-bit rotate right. One helper per rotation amount (16, 12, 8, 7)
-// and per vector width; the count stays a literal because the rotate
-// intrinsics take an immediate operand.
-
-// Rotate right by 16.
-INLINE __m128i rot16_128(__m128i x) {
-  return _mm_ror_epi32(x, 16);
-}
-
-INLINE __m256i rot16_256(__m256i x) {
-  return _mm256_ror_epi32(x, 16);
-}
-
-INLINE __m512i rot16_512(__m512i x) {
-  return _mm512_ror_epi32(x, 16);
-}
-
-// Rotate right by 12.
-INLINE __m128i rot12_128(__m128i x) {
-  return _mm_ror_epi32(x, 12);
-}
-
-INLINE __m256i rot12_256(__m256i x) {
-  return _mm256_ror_epi32(x, 12);
-}
-
-INLINE __m512i rot12_512(__m512i x) {
-  return _mm512_ror_epi32(x, 12);
-}
-
-// Rotate right by 8.
-INLINE __m128i rot8_128(__m128i x) {
-  return _mm_ror_epi32(x, 8);
-}
-
-INLINE __m256i rot8_256(__m256i x) {
-  return _mm256_ror_epi32(x, 8);
-}
-
-INLINE __m512i rot8_512(__m512i x) {
-  return _mm512_ror_epi32(x, 8);
-}
-
-// Rotate right by 7.
-INLINE __m128i rot7_128(__m128i x) {
-  return _mm_ror_epi32(x, 7);
-}
-
-INLINE __m256i rot7_256(__m256i x) {
-  return _mm256_ror_epi32(x, 7);
-}
-
-INLINE __m512i rot7_512(__m512i x) {
-  return _mm512_ror_epi32(x, 7);
-}
-
-/*
- * ----------------------------------------------------------------------------
- * compress_avx512
- * ----------------------------------------------------------------------------
- */
-
-// First half of the mixing step applied to four state rows at once:
-// adds message vector m and row1 into row0, then updates row3 (xor with row0,
-// rotate right 16), row2 (add row3) and row1 (xor with row2, rotate right 12).
-INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
-               __m128i m) {
-  const __m128i with_msg = add_128(*row0, m);
-  *row0 = add_128(with_msg, *row1);
-  *row3 = rot16_128(xor_128(*row3, *row0));
-  *row2 = add_128(*row2, *row3);
-  *row1 = rot12_128(xor_128(*row1, *row2));
-}
-
-// Second half of the mixing step: same data flow as g1 but with rotation
-// amounts 8 (for row3) and 7 (for row1).
-INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
-               __m128i m) {
-  const __m128i with_msg = add_128(*row0, m);
-  *row0 = add_128(with_msg, *row1);
-  *row3 = rot8_128(xor_128(*row3, *row0));
-  *row2 = add_128(*row2, *row3);
-  *row1 = rot7_128(xor_128(*row1, *row2));
-}
-
-// Rotates the lanes of rows 0, 2 and 3 so that the next g1/g2 pair operates
-// on the diagonals of the state. row1 is deliberately the row left in place
-// (instead of row0); every message load in compress_pre is arranged to match.
-// Background: https://github.com/sneves/blake2-avx2/pull/4
-INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
-  const __m128i r0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
-  const __m128i r3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
-  const __m128i r2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
-  *row0 = r0;
-  *row3 = r3;
-  *row2 = r2;
-}
-
-// Inverse of diagonalize: applies the opposite lane rotations to rows 0 and 2
-// (row3's half-swap is its own inverse) to restore column order.
-INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
-  const __m128i r0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
-  const __m128i r3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
-  const __m128i r2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
-  *row0 = r0;
-  *row3 = r3;
-  *row2 = r2;
-}
-
-// Runs the seven rounds of the compression function on one 64-byte block and
-// leaves the resulting 4x4 state in rows[0..3]. The final feed-forward XORs
-// are left to the callers.
-//
-// rows:      output, four 128-bit state rows.
-// cv:        eight input chaining-value words (fill rows 0 and 1).
-// block:     message block; all 64 bytes are read.
-// block_len, counter, flags: packed into row 3 as
-//            (counter low word, counter high word, block_len, flags).
-INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
-                         const uint8_t block[BLAKE3_BLOCK_LEN],
-                         uint8_t block_len, uint64_t counter, uint8_t flags) {
-  rows[0] = loadu_128((uint8_t *)&cv[0]);
-  rows[1] = loadu_128((uint8_t *)&cv[4]);
-  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
-  rows[3] = set4(counter_low(counter), counter_high(counter),
-                 (uint32_t)block_len, (uint32_t)flags);
-
-  __m128i m0 = loadu_128(block + 0 * sizeof(__m128i));
-  __m128i m1 = loadu_128(block + 1 * sizeof(__m128i));
-  __m128i m2 = loadu_128(block + 2 * sizeof(__m128i));
-  __m128i m3 = loadu_128(block + 3 * sizeof(__m128i));
-
-  __m128i t0, t1, t2, t3, tt;
-
-  // Round 1: permute the message words from input order into the four groups
-  // that are mixed in parallel.
-  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
-  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
-  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
-  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
-  diagonalize(&rows[0], &rows[2], &rows[3]);
-  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
-  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
-  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
-  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
-  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
-  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
-  undiagonalize(&rows[0], &rows[2], &rows[3]);
-  m0 = t0;
-  m1 = t1;
-  m2 = t2;
-  m3 = t3;
-
-  // Rounds 2..7: each round applies the same fixed permutation to the message
-  // vectors left by the previous round, so one loop body serves all six.
-  for (int round = 2; round <= 7; ++round) {
-    t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
-    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
-    g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
-
-    t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
-    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
-    t1 = _mm_blend_epi16(tt, t1, 0xCC);
-    g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
-
-    diagonalize(&rows[0], &rows[2], &rows[3]);
-
-    t2 = _mm_unpacklo_epi64(m3, m1);
-    tt = _mm_blend_epi16(t2, m2, 0xC0);
-    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
-    g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
-
-    t3 = _mm_unpackhi_epi32(m1, m3);
-    tt = _mm_unpacklo_epi32(m2, t3);
-    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
-    g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
-
-    undiagonalize(&rows[0], &rows[2], &rows[3]);
-
-    // Hand this round's permuted message to the next round. After round 7
-    // these values are never read again.
-    m0 = t0;
-    m1 = t1;
-    m2 = t2;
-    m3 = t3;
-  }
-}
-
-// Runs compress_pre on a single block and writes all 64 bytes of the result
-// to `out`. Bytes 0..31 are rows[0]^rows[2] and rows[1]^rows[3]; bytes 32..63
-// are rows[2] and rows[3], each XORed with one 16-byte half of `cv`.
-void blake3_compress_xof_avx512(const uint32_t cv[8],
-                                const uint8_t block[BLAKE3_BLOCK_LEN],
-                                uint8_t block_len, uint64_t counter,
-                                uint8_t flags, uint8_t out[64]) {
-  __m128i state[4];
-  compress_pre(state, cv, block, block_len, counter, flags);
-
-  // First half of the output: rows 0/1 combined with rows 2/3.
-  const __m128i folded_a = xor_128(state[0], state[2]);
-  storeu_128(folded_a, out);
-  const __m128i folded_b = xor_128(state[1], state[3]);
-  storeu_128(folded_b, out + 16);
-
-  // Second half: rows 2/3 combined with the input chaining value. The cv
-  // loads stay after the stores above, exactly as in the original sequence.
-  const __m128i cv_first = loadu_128((uint8_t *)&cv[0]);
-  storeu_128(xor_128(state[2], cv_first), out + 32);
-  const __m128i cv_second = loadu_128((uint8_t *)&cv[4]);
-  storeu_128(xor_128(state[3], cv_second), out + 48);
-}
-
-// Runs compress_pre on a single block and overwrites `cv` with the first 32
-// bytes of the result: rows[0]^rows[2] followed by rows[1]^rows[3].
-void blake3_compress_in_place_avx512(uint32_t cv[8],
-                                     const uint8_t block[BLAKE3_BLOCK_LEN],
-                                     uint8_t block_len, uint64_t counter,
-                                     uint8_t flags) {
-  __m128i state[4];
-  compress_pre(state, cv, block, block_len, counter, flags);
-
-  // cv is only written after compress_pre has finished reading it.
-  uint8_t *const cv_bytes = (uint8_t *)cv;
-  const __m128i new_lo = xor_128(state[0], state[2]);
-  storeu_128(new_lo, cv_bytes);
-  const __m128i new_hi = xor_128(state[1], state[3]);
-  storeu_128(new_hi, cv_bytes + 4 * sizeof(uint32_t));
-}
-
-/*
- * ----------------------------------------------------------------------------
- * hash4_avx512
- * ----------------------------------------------------------------------------
- */
-
-// One mixing step over four state vectors. The add/xor/rotate sequence runs
-// twice: first with message vector `mx` and the rot16/rot12 helpers, then
-// with message vector `my` and the rot8/rot7 helpers.
-INLINE void mix_128(__m128i *a, __m128i *b, __m128i *c, __m128i *d,
-                    __m128i mx, __m128i my) {
-  *a = add_128(add_128(*a, mx), *b);
-  *d = rot16_128(xor_128(*d, *a));
-  *c = add_128(*c, *d);
-  *b = rot12_128(xor_128(*b, *c));
-
-  *a = add_128(add_128(*a, my), *b);
-  *d = rot8_128(xor_128(*d, *a));
-  *c = add_128(*c, *d);
-  *b = rot7_128(xor_128(*b, *c));
-}
-
-// Applies round `r` to the 16-vector state `v`. Treating v[0..15] as a 4x4
-// grid (row-major), four mixes run down the columns and then four along the
-// diagonals. Each mix consumes two message vectors chosen through
-// MSG_SCHEDULE[r]. The four mixes within a step touch disjoint state entries,
-// so running them one after another gives the same result as interleaving.
-INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
-  // Columns.
-  mix_128(&v[0], &v[4], &v[8], &v[12],
-          m[(size_t)MSG_SCHEDULE[r][0]], m[(size_t)MSG_SCHEDULE[r][1]]);
-  mix_128(&v[1], &v[5], &v[9], &v[13],
-          m[(size_t)MSG_SCHEDULE[r][2]], m[(size_t)MSG_SCHEDULE[r][3]]);
-  mix_128(&v[2], &v[6], &v[10], &v[14],
-          m[(size_t)MSG_SCHEDULE[r][4]], m[(size_t)MSG_SCHEDULE[r][5]]);
-  mix_128(&v[3], &v[7], &v[11], &v[15],
-          m[(size_t)MSG_SCHEDULE[r][6]], m[(size_t)MSG_SCHEDULE[r][7]]);
-
-  // Diagonals.
-  mix_128(&v[0], &v[5], &v[10], &v[15],
-          m[(size_t)MSG_SCHEDULE[r][8]], m[(size_t)MSG_SCHEDULE[r][9]]);
-  mix_128(&v[1], &v[6], &v[11], &v[12],
-          m[(size_t)MSG_SCHEDULE[r][10]], m[(size_t)MSG_SCHEDULE[r][11]]);
-  mix_128(&v[2], &v[7], &v[8], &v[13],
-          m[(size_t)MSG_SCHEDULE[r][12]], m[(size_t)MSG_SCHEDULE[r][13]]);
-  mix_128(&v[3], &v[4], &v[9], &v[14],
-          m[(size_t)MSG_SCHEDULE[r][14]], m[(size_t)MSG_SCHEDULE[r][15]]);
-}
-
-// Transposes, in place, the 4x4 matrix of 32-bit words held in vecs[0..3]
-// (one row per vector).
-INLINE void transpose_vecs_128(__m128i vecs[4]) {
-  // Stage 1: interleave 32-bit lanes of rows 0/1 and of rows 2/3. The low
-  // unpack yields lanes 00/11 and the high unpack yields 22/33. Unlike the
-  // AVX2 counterparts, a 128-bit vector is not split into two halves here.
-  const __m128i rows01_lo = _mm_unpacklo_epi32(vecs[0], vecs[1]);
-  const __m128i rows01_hi = _mm_unpackhi_epi32(vecs[0], vecs[1]);
-  const __m128i rows23_lo = _mm_unpacklo_epi32(vecs[2], vecs[3]);
-  const __m128i rows23_hi = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-
-  // Stage 2: interleave 64-bit lanes. Every input has been read by now, so
-  // the results can go straight back into vecs.
-  vecs[0] = _mm_unpacklo_epi64(rows01_lo, rows23_lo);
-  vecs[1] = _mm_unpackhi_epi64(rows01_lo, rows23_lo);
-  vecs[2] = _mm_unpacklo_epi64(rows01_hi, rows23_hi);
-  vecs[3] = _mm_unpackhi_epi64(rows01_hi, rows23_hi);
-}
-
-// Loads 64 bytes starting at `block_offset` from each of inputs[0..3] and
-// transposes them into `out`. Also prefetches each input 256 bytes past
-// `block_offset`.
-INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
-                                size_t block_offset, __m128i out[16]) {
-  // out[4 * part + lane] holds the part-th 16-byte slice of input `lane`.
-  for (size_t part = 0; part < 4; ++part) {
-    const size_t byte_off = block_offset + part * sizeof(__m128i);
-    for (size_t lane = 0; lane < 4; ++lane) {
-      out[4 * part + lane] = loadu_128(&inputs[lane][byte_off]);
-    }
-  }
-
-  for (size_t lane = 0; lane < 4; ++lane) {
-    _mm_prefetch(&inputs[lane][block_offset + 256], _MM_HINT_T0);
-  }
-
-  // Each group of four vectors is transposed independently.
-  for (size_t group = 0; group < 16; group += 4) {
-    transpose_vecs_128(&out[group]);
-  }
-}
-
-// Builds the four per-lane 64-bit counters and returns them split into low
-// and high 32-bit halves. Lane i gets counter + i when `increment_counter`
-// is true, and plain `counter` otherwise.
-INLINE void load_counters4(uint64_t counter, bool increment_counter,
-                           __m128i *out_lo, __m128i *out_hi) {
-  // All-ones keeps the per-lane offsets, zero clears them.
-  const uint64_t keep = increment_counter ? ~(uint64_t)0 : 0;
-  const __m256i offsets = _mm256_and_si256(_mm256_set1_epi64x((int64_t)keep),
-                                           _mm256_setr_epi64x(0, 1, 2, 3));
-  const __m256i lanes =
-      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), offsets);
-
-  // Narrow each 64-bit lane to 32 bits: once as-is for the low words and
-  // once after shifting right by 32 for the high words.
-  *out_lo = _mm256_cvtepi64_epi32(lanes);
-  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(lanes, 32));
-}
-
-// Processes `blocks` consecutive BLAKE3_BLOCK_LEN-byte blocks from each of
-// inputs[0..3] in parallel, one input per 32-bit lane, starting from `key`.
-// Writes 8 x 16 bytes to `out`: for input i, the first 16 bytes of its
-// result land at out[32 * i] and the second 16 bytes at out[32 * i + 16].
-void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
-                         const uint32_t key[8], uint64_t counter,
-                         bool increment_counter, uint8_t flags,
-                         uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
-  // Chaining value: word w of the key broadcast across every lane.
-  __m128i cv[8];
-  for (size_t w = 0; w < 8; w++) {
-    cv[w] = set1_128(key[w]);
-  }
-
-  __m128i ctr_lo, ctr_hi;
-  load_counters4(counter, increment_counter, &ctr_lo, &ctr_hi);
-
-  for (size_t block = 0; block < blocks; block++) {
-    // flags_start applies to the first block only and flags_end to the last
-    // block only; when there is a single block it gets both.
-    uint8_t block_flags = flags;
-    if (block == 0) {
-      block_flags |= flags_start;
-    }
-    if (block + 1 == blocks) {
-      block_flags |= flags_end;
-    }
-
-    __m128i msg[16];
-    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg);
-
-    __m128i v[16] = {
-        cv[0], cv[1], cv[2], cv[3],
-        cv[4], cv[5], cv[6], cv[7],
-        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
-        ctr_lo, ctr_hi, set1_128(BLAKE3_BLOCK_LEN), set1_128(block_flags),
-    };
-
-    // The round index is passed as a literal on every call.
-    round_fn4(v, msg, 0);
-    round_fn4(v, msg, 1);
-    round_fn4(v, msg, 2);
-    round_fn4(v, msg, 3);
-    round_fn4(v, msg, 4);
-    round_fn4(v, msg, 5);
-    round_fn4(v, msg, 6);
-
-    // New chaining value: upper half of the state XOR lower half.
-    for (size_t w = 0; w < 8; w++) {
-      cv[w] = xor_128(v[w], v[w + 8]);
-    }
-  }
-
-  // After these two transposes cv[i] holds the first half of output i and
-  // cv[4 + i] holds its second half; store the two halves back to back.
-  transpose_vecs_128(&cv[0]);
-  transpose_vecs_128(&cv[4]);
-  for (size_t i = 0; i < 4; i++) {
-    storeu_128(cv[i], &out[(2 * i) * sizeof(__m128i)]);
-    storeu_128(cv[i + 4], &out[(2 * i + 1) * sizeof(__m128i)]);
-  }
-}
-
-/*
- * ----------------------------------------------------------------------------
- * hash8_avx512
- * ----------------------------------------------------------------------------
- */
-
-// One mixing step over four state vectors. The add/xor/rotate sequence runs
-// twice: first with message vector `mx` and the rot16/rot12 helpers, then
-// with message vector `my` and the rot8/rot7 helpers.
-INLINE void mix_256(__m256i *a, __m256i *b, __m256i *c, __m256i *d,
-                    __m256i mx, __m256i my) {
-  *a = add_256(add_256(*a, mx), *b);
-  *d = rot16_256(xor_256(*d, *a));
-  *c = add_256(*c, *d);
-  *b = rot12_256(xor_256(*b, *c));
-
-  *a = add_256(add_256(*a, my), *b);
-  *d = rot8_256(xor_256(*d, *a));
-  *c = add_256(*c, *d);
-  *b = rot7_256(xor_256(*b, *c));
-}
-
-// Applies round `r` to the 16-vector state `v`. Treating v[0..15] as a 4x4
-// grid (row-major), four mixes run down the columns and then four along the
-// diagonals. Each mix consumes two message vectors chosen through
-// MSG_SCHEDULE[r]. The four mixes within a step touch disjoint state entries,
-// so running them one after another gives the same result as interleaving.
-INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
-  // Columns.
-  mix_256(&v[0], &v[4], &v[8], &v[12],
-          m[(size_t)MSG_SCHEDULE[r][0]], m[(size_t)MSG_SCHEDULE[r][1]]);
-  mix_256(&v[1], &v[5], &v[9], &v[13],
-          m[(size_t)MSG_SCHEDULE[r][2]], m[(size_t)MSG_SCHEDULE[r][3]]);
-  mix_256(&v[2], &v[6], &v[10], &v[14],
-          m[(size_t)MSG_SCHEDULE[r][4]], m[(size_t)MSG_SCHEDULE[r][5]]);
-  mix_256(&v[3], &v[7], &v[11], &v[15],
-          m[(size_t)MSG_SCHEDULE[r][6]], m[(size_t)MSG_SCHEDULE[r][7]]);
-
-  // Diagonals.
-  mix_256(&v[0], &v[5], &v[10], &v[15],
-          m[(size_t)MSG_SCHEDULE[r][8]], m[(size_t)MSG_SCHEDULE[r][9]]);
-  mix_256(&v[1], &v[6], &v[11], &v[12],
-          m[(size_t)MSG_SCHEDULE[r][10]], m[(size_t)MSG_SCHEDULE[r][11]]);
-  mix_256(&v[2], &v[7], &v[8], &v[13],
-          m[(size_t)MSG_SCHEDULE[r][12]], m[(size_t)MSG_SCHEDULE[r][13]]);
-  mix_256(&v[3], &v[4], &v[9], &v[14],
-          m[(size_t)MSG_SCHEDULE[r][14]], m[(size_t)MSG_SCHEDULE[r][15]]);
-}
-
-// Transposes, in place, the 8x8 matrix of 32-bit words held in vecs[0..7]
-// (one row per vector), using three interleave stages.
-INLINE void transpose_vecs_256(__m256i vecs[8]) {
-  // Stage 1: interleave 32-bit lanes of adjacent rows. The low unpack yields
-  // lanes 00/11/44/55 and the high unpack yields 22/33/66/77.
-  const __m256i r01_lo = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
-  const __m256i r01_hi = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
-  const __m256i r23_lo = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
-  const __m256i r23_hi = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
-  const __m256i r45_lo = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
-  const __m256i r45_hi = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
-  const __m256i r67_lo = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
-  const __m256i r67_hi = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
-
-  // Stage 2: interleave 64-bit lanes. The low unpack yields lanes 00/22 and
-  // the high unpack yields 11/33. The cNN suffix names the two columns that
-  // end up in the low and high 128-bit halves of each result.
-  const __m256i top_c04 = _mm256_unpacklo_epi64(r01_lo, r23_lo);
-  const __m256i top_c15 = _mm256_unpackhi_epi64(r01_lo, r23_lo);
-  const __m256i top_c26 = _mm256_unpacklo_epi64(r01_hi, r23_hi);
-  const __m256i top_c37 = _mm256_unpackhi_epi64(r01_hi, r23_hi);
-  const __m256i bot_c04 = _mm256_unpacklo_epi64(r45_lo, r67_lo);
-  const __m256i bot_c15 = _mm256_unpackhi_epi64(r45_lo, r67_lo);
-  const __m256i bot_c26 = _mm256_unpacklo_epi64(r45_hi, r67_hi);
-  const __m256i bot_c37 = _mm256_unpackhi_epi64(r45_hi, r67_hi);
-
-  // Stage 3: interleave 128-bit halves. Selector 0x20 pairs the low halves
-  // of both operands, 0x31 pairs the high halves.
-  vecs[0] = _mm256_permute2x128_si256(top_c04, bot_c04, 0x20);
-  vecs[1] = _mm256_permute2x128_si256(top_c15, bot_c15, 0x20);
-  vecs[2] = _mm256_permute2x128_si256(top_c26, bot_c26, 0x20);
-  vecs[3] = _mm256_permute2x128_si256(top_c37, bot_c37, 0x20);
-  vecs[4] = _mm256_permute2x128_si256(top_c04, bot_c04, 0x31);
-  vecs[5] = _mm256_permute2x128_si256(top_c15, bot_c15, 0x31);
-  vecs[6] = _mm256_permute2x128_si256(top_c26, bot_c26, 0x31);
-  vecs[7] = _mm256_permute2x128_si256(top_c37, bot_c37, 0x31);
-}
-
-// Loads 64 bytes starting at `block_offset` from each of inputs[0..7] and
-// transposes them into `out`. Also prefetches each input 256 bytes past
-// `block_offset`.
-INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
-                                size_t block_offset, __m256i out[16]) {
-  // out[8 * half + lane] holds the half-th 32-byte slice of input `lane`.
-  for (size_t half = 0; half < 2; ++half) {
-    const size_t byte_off = block_offset + half * sizeof(__m256i);
-    for (size_t lane = 0; lane < 8; ++lane) {
-      out[8 * half + lane] = loadu_256(&inputs[lane][byte_off]);
-    }
-  }
-
-  for (size_t lane = 0; lane < 8; ++lane) {
-    _mm_prefetch(&inputs[lane][block_offset + 256], _MM_HINT_T0);
-  }
-
-  // Each group of eight vectors is transposed independently.
-  transpose_vecs_256(&out[0]);
-  transpose_vecs_256(&out[8]);
-}
-
-// Builds the eight per-lane 64-bit counters and returns them split into low
-// and high 32-bit halves. Lane i gets counter + i when `increment_counter`
-// is true, and plain `counter` otherwise.
-INLINE void load_counters8(uint64_t counter, bool increment_counter,
-                           __m256i *out_lo, __m256i *out_hi) {
-  // All-ones keeps the per-lane offsets, zero clears them.
-  const uint64_t keep = increment_counter ? ~(uint64_t)0 : 0;
-  const __m512i offsets =
-      _mm512_and_si512(_mm512_set1_epi64((int64_t)keep),
-                       _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7));
-  const __m512i lanes =
-      _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), offsets);
-
-  // Narrow each 64-bit lane to 32 bits: once as-is for the low words and
-  // once after shifting right by 32 for the high words.
-  *out_lo = _mm512_cvtepi64_epi32(lanes);
-  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(lanes, 32));
-}
-
-// Processes `blocks` consecutive BLAKE3_BLOCK_LEN-byte blocks from each of
-// inputs[0..7] in parallel, one input per 32-bit lane, starting from `key`.
-// Writes 8 x 32 bytes to `out`, one transposed chaining-value vector per
-// 32-byte slot.
-void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
-                         const uint32_t key[8], uint64_t counter,
-                         bool increment_counter, uint8_t flags,
-                         uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
-  // Chaining value: word w of the key broadcast across every lane.
-  __m256i cv[8];
-  for (size_t w = 0; w < 8; w++) {
-    cv[w] = set1_256(key[w]);
-  }
-
-  __m256i ctr_lo, ctr_hi;
-  load_counters8(counter, increment_counter, &ctr_lo, &ctr_hi);
-
-  for (size_t block = 0; block < blocks; block++) {
-    // flags_start applies to the first block only and flags_end to the last
-    // block only; when there is a single block it gets both.
-    uint8_t block_flags = flags;
-    if (block == 0) {
-      block_flags |= flags_start;
-    }
-    if (block + 1 == blocks) {
-      block_flags |= flags_end;
-    }
-
-    __m256i msg[16];
-    transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg);
-
-    __m256i v[16] = {
-        cv[0], cv[1], cv[2], cv[3],
-        cv[4], cv[5], cv[6], cv[7],
-        set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
-        ctr_lo, ctr_hi, set1_256(BLAKE3_BLOCK_LEN), set1_256(block_flags),
-    };
-
-    // The round index is passed as a literal on every call.
-    round_fn8(v, msg, 0);
-    round_fn8(v, msg, 1);
-    round_fn8(v, msg, 2);
-    round_fn8(v, msg, 3);
-    round_fn8(v, msg, 4);
-    round_fn8(v, msg, 5);
-    round_fn8(v, msg, 6);
-
-    // New chaining value: upper half of the state XOR lower half.
-    for (size_t w = 0; w < 8; w++) {
-      cv[w] = xor_256(v[w], v[w + 8]);
-    }
-  }
-
-  // Transpose the eight chaining-value vectors and store them back to back.
-  transpose_vecs_256(cv);
-  for (size_t i = 0; i < 8; i++) {
-    storeu_256(cv[i], &out[i * sizeof(__m256i)]);
-  }
-}
-
-/*
- * ----------------------------------------------------------------------------
- * hash16_avx512
- * ----------------------------------------------------------------------------
- */
-
-// One mixing step over four state vectors. The add/xor/rotate sequence runs
-// twice: first with message vector `mx` and the rot16/rot12 helpers, then
-// with message vector `my` and the rot8/rot7 helpers.
-INLINE void mix_512(__m512i *a, __m512i *b, __m512i *c, __m512i *d,
-                    __m512i mx, __m512i my) {
-  *a = add_512(add_512(*a, mx), *b);
-  *d = rot16_512(xor_512(*d, *a));
-  *c = add_512(*c, *d);
-  *b = rot12_512(xor_512(*b, *c));
-
-  *a = add_512(add_512(*a, my), *b);
-  *d = rot8_512(xor_512(*d, *a));
-  *c = add_512(*c, *d);
-  *b = rot7_512(xor_512(*b, *c));
-}
-
-// Applies round `r` to the 16-vector state `v`. Treating v[0..15] as a 4x4
-// grid (row-major), four mixes run down the columns and then four along the
-// diagonals. Each mix consumes two message vectors chosen through
-// MSG_SCHEDULE[r]. The four mixes within a step touch disjoint state entries,
-// so running them one after another gives the same result as interleaving.
-INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
-  // Columns.
-  mix_512(&v[0], &v[4], &v[8], &v[12],
-          m[(size_t)MSG_SCHEDULE[r][0]], m[(size_t)MSG_SCHEDULE[r][1]]);
-  mix_512(&v[1], &v[5], &v[9], &v[13],
-          m[(size_t)MSG_SCHEDULE[r][2]], m[(size_t)MSG_SCHEDULE[r][3]]);
-  mix_512(&v[2], &v[6], &v[10], &v[14],
-          m[(size_t)MSG_SCHEDULE[r][4]], m[(size_t)MSG_SCHEDULE[r][5]]);
-  mix_512(&v[3], &v[7], &v[11], &v[15],
-          m[(size_t)MSG_SCHEDULE[r][6]], m[(size_t)MSG_SCHEDULE[r][7]]);
-
-  // Diagonals.
-  mix_512(&v[0], &v[5], &v[10], &v[15],
-          m[(size_t)MSG_SCHEDULE[r][8]], m[(size_t)MSG_SCHEDULE[r][9]]);
-  mix_512(&v[1], &v[6], &v[11], &v[12],
-          m[(size_t)MSG_SCHEDULE[r][10]], m[(size_t)MSG_SCHEDULE[r][11]]);
-  mix_512(&v[2], &v[7], &v[8], &v[13],
-          m[(size_t)MSG_SCHEDULE[r][12]], m[(size_t)MSG_SCHEDULE[r][13]]);
-  mix_512(&v[3], &v[4], &v[9], &v[14],
-          m[(size_t)MSG_SCHEDULE[r][14]], m[(size_t)MSG_SCHEDULE[r][15]]);
-}
-
-// Lane selector for _mm512_shuffle_i32x4: 0b10001000, two bits per output
-// lane with the lowest lane first, i.e. 128-bit lanes a0/a2/b0/b2.
-#define LO_IMM8 0x88
-
-// Returns the even-numbered 128-bit lanes of `a` followed by those of `b`.
-INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
-  const __m512i even_lanes = _mm512_shuffle_i32x4(a, b, LO_IMM8);
-  return even_lanes;
-}
-
-// Lane selector for _mm512_shuffle_i32x4: 0b11011101, two bits per output
-// lane with the lowest lane first, i.e. 128-bit lanes a1/a3/b1/b3.
-#define HI_IMM8 0xdd
-
-// Returns the odd-numbered 128-bit lanes of `a` followed by those of `b`.
-INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
-  const __m512i odd_lanes = _mm512_shuffle_i32x4(a, b, HI_IMM8);
-  return odd_lanes;
-}
-
-INLINE void transpose_vecs_512(__m512i vecs[16]) { // In-place transpose of a 16x16 matrix of 32-bit words (one row per vector), done in four unpack stages.
- // Interleave 32-bit lanes. The _0 unpack is lanes
- // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
- // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
- __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
- __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
- __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
- __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
- __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
- __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
- __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
- __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
- __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
- __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
- __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
- __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
- __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
- __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
- __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
- __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
-
- // Interleave 64-bit lanes. The _0 unpack is lanes
- // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
- // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
- // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
- // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
- __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
- __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
- __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
- __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
- __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
- __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
- __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
- __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
- __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
- __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
- __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
- __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
- __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
- __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
- __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
- __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
-
- // Interleave 128-bit lanes. The _0 unpack is
- // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
- // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
- __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
- __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
- __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
- __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
- __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
- __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
- __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
- __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
- __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
- __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
- __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
- __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
- __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
- __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
- __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
- __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
-
- // Interleave 128-bit lanes again for the final outputs.
- vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
- vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
- vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
- vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
- vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
- vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
- vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
- vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
- vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
- vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
- vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
- vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
- vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
- vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
- vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
- vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
-}
-
-// Loads the 64 bytes at `block_offset` from each of inputs[0..15] and
-// transposes them into `out`. Also prefetches each input 256 bytes past
-// `block_offset`.
-INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
-                                 size_t block_offset, __m512i out[16]) {
-  // One full 64-byte block per input fits in a single 512-bit vector.
-  for (size_t lane = 0; lane < 16; ++lane) {
-    out[lane] = loadu_512(&inputs[lane][block_offset]);
-  }
-
-  for (size_t lane = 0; lane < 16; ++lane) {
-    _mm_prefetch(&inputs[lane][block_offset + 256], _MM_HINT_T0);
-  }
-
-  transpose_vecs_512(out);
-}
-
-// Builds the sixteen per-lane 64-bit counters as separate low and high
-// 32-bit halves. Lane i gets counter + i when `increment_counter` is true,
-// and plain `counter` otherwise. The addition is done on the low words, with
-// the carry applied to the high words by hand.
-INLINE void load_counters16(uint64_t counter, bool increment_counter,
-                            __m512i *out_lo, __m512i *out_hi) {
-  // -(int32_t)true is all-ones and -(int32_t)false is zero, so the AND either
-  // keeps the lane indices 0..15 or clears them.
-  const __m512i keep = _mm512_set1_epi32(-(int32_t)increment_counter);
-  const __m512i lane_index = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
-                                              7, 6, 5, 4, 3, 2, 1, 0);
-  const __m512i offsets = _mm512_and_si512(keep, lane_index);
-
-  // The narrowing of `counter` to 32 bits is spelled out; the values are the
-  // same as with the implicit conversion.
-  const __m512i base_lo = _mm512_set1_epi32((int32_t)counter);
-  const __m512i base_hi = _mm512_set1_epi32((int32_t)(counter >> 32));
-
-  const __m512i lo = _mm512_add_epi32(base_lo, offsets);
-  // A low word wrapped around exactly when the unsigned sum is smaller than
-  // the offset that was added to it.
-  const __mmask16 wrapped = _mm512_cmp_epu32_mask(lo, offsets, _MM_CMPINT_LT);
-  // Lanes that wrapped get high word + 1; the others keep the high word.
-  const __m512i hi =
-      _mm512_mask_add_epi32(base_hi, wrapped, base_hi, _mm512_set1_epi32(1));
-
-  *out_lo = lo;
-  *out_hi = hi;
-}
-
-// Processes `blocks` consecutive BLAKE3_BLOCK_LEN-byte blocks from each of
-// inputs[0..15] in parallel, one input per 32-bit lane, starting from `key`.
-// Writes 16 x 32 bytes to `out`, one 32-byte slot per transposed vector.
-void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
-                          const uint32_t key[8], uint64_t counter,
-                          bool increment_counter, uint8_t flags,
-                          uint8_t flags_start, uint8_t flags_end,
-                          uint8_t *out) {
-  // Chaining value: word w of the key broadcast across every lane.
-  __m512i cv[8];
-  for (size_t w = 0; w < 8; w++) {
-    cv[w] = set1_512(key[w]);
-  }
-
-  __m512i ctr_lo, ctr_hi;
-  load_counters16(counter, increment_counter, &ctr_lo, &ctr_hi);
-
-  for (size_t block = 0; block < blocks; block++) {
-    // flags_start applies to the first block only and flags_end to the last
-    // block only; when there is a single block it gets both.
-    uint8_t block_flags = flags;
-    if (block == 0) {
-      block_flags |= flags_start;
-    }
-    if (block + 1 == blocks) {
-      block_flags |= flags_end;
-    }
-
-    __m512i msg[16];
-    transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg);
-
-    __m512i v[16] = {
-        cv[0], cv[1], cv[2], cv[3],
-        cv[4], cv[5], cv[6], cv[7],
-        set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
-        ctr_lo, ctr_hi, set1_512(BLAKE3_BLOCK_LEN), set1_512(block_flags),
-    };
-
-    // The round index is passed as a literal on every call.
-    round_fn16(v, msg, 0);
-    round_fn16(v, msg, 1);
-    round_fn16(v, msg, 2);
-    round_fn16(v, msg, 3);
-    round_fn16(v, msg, 4);
-    round_fn16(v, msg, 5);
-    round_fn16(v, msg, 6);
-
-    // New chaining value: upper half of the state XOR lower half.
-    for (size_t w = 0; w < 8; w++) {
-      cv[w] = xor_512(v[w], v[w + 8]);
-    }
-  }
-
-  // transpose_vecs_512 works on a 16x16 matrix of words but only 8 state
-  // vectors exist, so the matrix is padded with zero rows. After the
-  // transpose, only the lower 256 bits of each vector are stored.
-  __m512i padded[16];
-  for (size_t i = 0; i < 8; i++) {
-    padded[i] = cv[i];
-    padded[i + 8] = set1_512(0);
-  }
-  transpose_vecs_512(padded);
-
-  for (size_t i = 0; i < 16; i++) {
-    _mm256_mask_storeu_epi32(&out[i * sizeof(__m256i)], (__mmask8)-1,
-                             _mm512_castsi512_si256(padded[i]));
-  }
-}
-
-/*
- * ----------------------------------------------------------------------------
- * hash_many_avx512
- * ----------------------------------------------------------------------------
- */
-
-INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
- const uint32_t key[8], uint64_t counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
- uint32_t cv[8];
- memcpy(cv, key, BLAKE3_KEY_LEN);
- uint8_t block_flags = flags | flags_start;
- while (blocks > 0) {
- if (blocks == 1) {
- block_flags |= flags_end;
- }
- blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
- block_flags);
- input = &input[BLAKE3_BLOCK_LEN];
- blocks -= 1;
- block_flags = flags;
- }
- memcpy(out, cv, BLAKE3_OUT_LEN);
-}
-
-void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out) {
- while (num_inputs >= 16) {
- blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
- flags_start, flags_end, out);
- if (increment_counter) {
- counter += 16;
- }
- inputs += 16;
- num_inputs -= 16;
- out = &out[16 * BLAKE3_OUT_LEN];
- }
- while (num_inputs >= 8) {
- blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
- flags_start, flags_end, out);
- if (increment_counter) {
- counter += 8;
- }
- inputs += 8;
- num_inputs -= 8;
- out = &out[8 * BLAKE3_OUT_LEN];
- }
- while (num_inputs >= 4) {
- blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
- flags_start, flags_end, out);
- if (increment_counter) {
- counter += 4;
- }
- inputs += 4;
- num_inputs -= 4;
- out = &out[4 * BLAKE3_OUT_LEN];
- }
- while (num_inputs > 0) {
- hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
- flags_end, out);
- if (increment_counter) {
- counter += 1;
- }
- inputs += 1;
- num_inputs -= 1;
- out = &out[BLAKE3_OUT_LEN];
- }
-}
diff --git a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S
deleted file mode 100644
index a06aede0f..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S
+++ /dev/null
@@ -1,2585 +0,0 @@
-#if defined(__ELF__) && defined(__linux__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
-#if __has_include(<cet.h>)
-#include <cet.h>
-#endif
-#endif
-
-#if !defined(_CET_ENDBR)
-#define _CET_ENDBR
-#endif
-
-.intel_syntax noprefix
-.global _blake3_hash_many_avx512
-.global blake3_hash_many_avx512
-.global blake3_compress_in_place_avx512
-.global _blake3_compress_in_place_avx512
-.global blake3_compress_xof_avx512
-.global _blake3_compress_xof_avx512
-
-#ifdef __APPLE__
-.text
-#else
-.section .text
-#endif
-.p2align 6
-_blake3_hash_many_avx512:
-blake3_hash_many_avx512:
- _CET_ENDBR
- push r15
- push r14
- push r13
- push r12
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 144
- and rsp, 0xFFFFFFFFFFFFFFC0
- neg r9
- kmovw k1, r9d
- vmovd xmm0, r8d
- vpbroadcastd ymm0, xmm0
- shr r8, 32
- vmovd xmm1, r8d
- vpbroadcastd ymm1, xmm1
- vmovdqa ymm4, ymm1
- vmovdqa ymm5, ymm1
- vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
- vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
- vpcmpltud k2, ymm2, ymm0
- vpcmpltud k3, ymm3, ymm0
- vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
- vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
- knotw k2, k1
- vmovdqa32 ymm2 {k2}, ymm0
- vmovdqa32 ymm3 {k2}, ymm0
- vmovdqa32 ymm4 {k2}, ymm1
- vmovdqa32 ymm5 {k2}, ymm1
- vmovdqa ymmword ptr [rsp], ymm2
- vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
- vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
- vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
- shl rdx, 6
- mov qword ptr [rsp+0x80], rdx
- cmp rsi, 16
- jc 3f
-2:
- vpbroadcastd zmm0, dword ptr [rcx]
- vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
- vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
- vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
- vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
- vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
- vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
- vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
- movzx eax, byte ptr [rbp+0x38]
- movzx ebx, byte ptr [rbp+0x40]
- or eax, ebx
- xor edx, edx
-.p2align 5
-9:
- movzx ebx, byte ptr [rbp+0x48]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+0x80]
- cmove eax, ebx
- mov dword ptr [rsp+0x88], eax
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x40]
- mov r13, qword ptr [rdi+0x48]
- mov r14, qword ptr [rdi+0x50]
- mov r15, qword ptr [rdi+0x58]
- vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
- vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
- vpunpcklqdq zmm8, zmm16, zmm17
- vpunpckhqdq zmm9, zmm16, zmm17
- vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
- vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
- vpunpcklqdq zmm10, zmm18, zmm19
- vpunpckhqdq zmm11, zmm18, zmm19
- mov r8, qword ptr [rdi+0x20]
- mov r9, qword ptr [rdi+0x28]
- mov r10, qword ptr [rdi+0x30]
- mov r11, qword ptr [rdi+0x38]
- mov r12, qword ptr [rdi+0x60]
- mov r13, qword ptr [rdi+0x68]
- mov r14, qword ptr [rdi+0x70]
- mov r15, qword ptr [rdi+0x78]
- vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
- vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
- vpunpcklqdq zmm12, zmm16, zmm17
- vpunpckhqdq zmm13, zmm16, zmm17
- vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
- vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
- vpunpcklqdq zmm14, zmm18, zmm19
- vpunpckhqdq zmm15, zmm18, zmm19
- vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
- vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
- vshufps zmm16, zmm8, zmm10, 136
- vshufps zmm17, zmm12, zmm14, 136
- vmovdqa32 zmm20, zmm16
- vpermt2d zmm16, zmm27, zmm17
- vpermt2d zmm20, zmm31, zmm17
- vshufps zmm17, zmm8, zmm10, 221
- vshufps zmm30, zmm12, zmm14, 221
- vmovdqa32 zmm21, zmm17
- vpermt2d zmm17, zmm27, zmm30
- vpermt2d zmm21, zmm31, zmm30
- vshufps zmm18, zmm9, zmm11, 136
- vshufps zmm8, zmm13, zmm15, 136
- vmovdqa32 zmm22, zmm18
- vpermt2d zmm18, zmm27, zmm8
- vpermt2d zmm22, zmm31, zmm8
- vshufps zmm19, zmm9, zmm11, 221
- vshufps zmm8, zmm13, zmm15, 221
- vmovdqa32 zmm23, zmm19
- vpermt2d zmm19, zmm27, zmm8
- vpermt2d zmm23, zmm31, zmm8
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x40]
- mov r13, qword ptr [rdi+0x48]
- mov r14, qword ptr [rdi+0x50]
- mov r15, qword ptr [rdi+0x58]
- vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm8, zmm24, zmm25
- vpunpckhqdq zmm9, zmm24, zmm25
- vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm10, zmm24, zmm25
- vpunpckhqdq zmm11, zmm24, zmm25
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r12+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r13+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r14+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- prefetcht0 [r15+rdx+0x80]
- mov r8, qword ptr [rdi+0x20]
- mov r9, qword ptr [rdi+0x28]
- mov r10, qword ptr [rdi+0x30]
- mov r11, qword ptr [rdi+0x38]
- mov r12, qword ptr [rdi+0x60]
- mov r13, qword ptr [rdi+0x68]
- mov r14, qword ptr [rdi+0x70]
- mov r15, qword ptr [rdi+0x78]
- vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm12, zmm24, zmm25
- vpunpckhqdq zmm13, zmm24, zmm25
- vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm14, zmm24, zmm25
- vpunpckhqdq zmm15, zmm24, zmm25
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r12+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r13+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r14+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- prefetcht0 [r15+rdx+0x80]
- vshufps zmm24, zmm8, zmm10, 136
- vshufps zmm30, zmm12, zmm14, 136
- vmovdqa32 zmm28, zmm24
- vpermt2d zmm24, zmm27, zmm30
- vpermt2d zmm28, zmm31, zmm30
- vshufps zmm25, zmm8, zmm10, 221
- vshufps zmm30, zmm12, zmm14, 221
- vmovdqa32 zmm29, zmm25
- vpermt2d zmm25, zmm27, zmm30
- vpermt2d zmm29, zmm31, zmm30
- vshufps zmm26, zmm9, zmm11, 136
- vshufps zmm8, zmm13, zmm15, 136
- vmovdqa32 zmm30, zmm26
- vpermt2d zmm26, zmm27, zmm8
- vpermt2d zmm30, zmm31, zmm8
- vshufps zmm8, zmm9, zmm11, 221
- vshufps zmm10, zmm13, zmm15, 221
- vpermi2d zmm27, zmm8, zmm10
- vpermi2d zmm31, zmm8, zmm10
- vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
- vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
- vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
- vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
- vmovdqa32 zmm12, zmmword ptr [rsp]
- vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
- vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
- vpaddd zmm0, zmm0, zmm16
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm20
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm17
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm21
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm24
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm28
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm25
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm29
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm18
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm23
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm22
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm16
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm17
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm25
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm27
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm30
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm19
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm29
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm20
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm18
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm22
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm27
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm21
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm31
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm26
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm30
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm23
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm19
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm20
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm21
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm16
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm24
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm28
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm31
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm29
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm26
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm23
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm16
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm18
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm17
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm25
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm24
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm30
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm28
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm29
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm18
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm19
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm22
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm27
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm17
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm31
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm25
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm30
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm19
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm26
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm20
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpxord zmm0, zmm0, zmm8
- vpxord zmm1, zmm1, zmm9
- vpxord zmm2, zmm2, zmm10
- vpxord zmm3, zmm3, zmm11
- vpxord zmm4, zmm4, zmm12
- vpxord zmm5, zmm5, zmm13
- vpxord zmm6, zmm6, zmm14
- vpxord zmm7, zmm7, zmm15
- movzx eax, byte ptr [rbp+0x38]
- jne 9b
- mov rbx, qword ptr [rbp+0x50]
- vpunpckldq zmm16, zmm0, zmm1
- vpunpckhdq zmm17, zmm0, zmm1
- vpunpckldq zmm18, zmm2, zmm3
- vpunpckhdq zmm19, zmm2, zmm3
- vpunpckldq zmm20, zmm4, zmm5
- vpunpckhdq zmm21, zmm4, zmm5
- vpunpckldq zmm22, zmm6, zmm7
- vpunpckhdq zmm23, zmm6, zmm7
- vpunpcklqdq zmm0, zmm16, zmm18
- vpunpckhqdq zmm1, zmm16, zmm18
- vpunpcklqdq zmm2, zmm17, zmm19
- vpunpckhqdq zmm3, zmm17, zmm19
- vpunpcklqdq zmm4, zmm20, zmm22
- vpunpckhqdq zmm5, zmm20, zmm22
- vpunpcklqdq zmm6, zmm21, zmm23
- vpunpckhqdq zmm7, zmm21, zmm23
- vshufi32x4 zmm16, zmm0, zmm4, 0x88
- vshufi32x4 zmm17, zmm1, zmm5, 0x88
- vshufi32x4 zmm18, zmm2, zmm6, 0x88
- vshufi32x4 zmm19, zmm3, zmm7, 0x88
- vshufi32x4 zmm20, zmm0, zmm4, 0xDD
- vshufi32x4 zmm21, zmm1, zmm5, 0xDD
- vshufi32x4 zmm22, zmm2, zmm6, 0xDD
- vshufi32x4 zmm23, zmm3, zmm7, 0xDD
- vshufi32x4 zmm0, zmm16, zmm17, 0x88
- vshufi32x4 zmm1, zmm18, zmm19, 0x88
- vshufi32x4 zmm2, zmm20, zmm21, 0x88
- vshufi32x4 zmm3, zmm22, zmm23, 0x88
- vshufi32x4 zmm4, zmm16, zmm17, 0xDD
- vshufi32x4 zmm5, zmm18, zmm19, 0xDD
- vshufi32x4 zmm6, zmm20, zmm21, 0xDD
- vshufi32x4 zmm7, zmm22, zmm23, 0xDD
- vmovdqu32 zmmword ptr [rbx], zmm0
- vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
- vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
- vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
- vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
- vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
- vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
- vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
- vmovdqa32 zmm0, zmmword ptr [rsp]
- vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
- vmovdqa32 zmm2, zmm0
- vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
- vpcmpltud k2, zmm2, zmm0
- vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
- vmovdqa32 zmmword ptr [rsp], zmm2
- vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
- add rdi, 128
- add rbx, 512
- mov qword ptr [rbp+0x50], rbx
- sub rsi, 16
- cmp rsi, 16
- jnc 2b
- test rsi, rsi
- jnz 3f
-4:
- vzeroupper
- mov rsp, rbp
- pop rbp
- pop rbx
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 6
-3:
- test esi, 0x8
- je 3f
- vpbroadcastd ymm0, dword ptr [rcx]
- vpbroadcastd ymm1, dword ptr [rcx+0x4]
- vpbroadcastd ymm2, dword ptr [rcx+0x8]
- vpbroadcastd ymm3, dword ptr [rcx+0xC]
- vpbroadcastd ymm4, dword ptr [rcx+0x10]
- vpbroadcastd ymm5, dword ptr [rcx+0x14]
- vpbroadcastd ymm6, dword ptr [rcx+0x18]
- vpbroadcastd ymm7, dword ptr [rcx+0x1C]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x20]
- mov r13, qword ptr [rdi+0x28]
- mov r14, qword ptr [rdi+0x30]
- mov r15, qword ptr [rdi+0x38]
- movzx eax, byte ptr [rbp+0x38]
- movzx ebx, byte ptr [rbp+0x40]
- or eax, ebx
- xor edx, edx
-2:
- movzx ebx, byte ptr [rbp+0x48]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+0x80]
- cmove eax, ebx
- mov dword ptr [rsp+0x88], eax
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x40]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x40]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x40]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm16, ymm12, ymm14, 136
- vshufps ymm17, ymm12, ymm14, 221
- vshufps ymm18, ymm13, ymm15, 136
- vshufps ymm19, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x30]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x30]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x30]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm20, ymm12, ymm14, 136
- vshufps ymm21, ymm12, ymm14, 221
- vshufps ymm22, ymm13, ymm15, 136
- vshufps ymm23, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x20]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x20]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x20]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm24, ymm12, ymm14, 136
- vshufps ymm25, ymm12, ymm14, 221
- vshufps ymm26, ymm13, ymm15, 136
- vshufps ymm27, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x10]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x10]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x10]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm28, ymm12, ymm14, 136
- vshufps ymm29, ymm12, ymm14, 221
- vshufps ymm30, ymm13, ymm15, 136
- vshufps ymm31, ymm13, ymm15, 221
- vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
- vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
- vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
- vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
- vmovdqa ymm12, ymmword ptr [rsp]
- vmovdqa ymm13, ymmword ptr [rsp+0x40]
- vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vpbroadcastd ymm15, dword ptr [rsp+0x88]
- vpaddd ymm0, ymm0, ymm16
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm20
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm17
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm21
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm24
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm28
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm25
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm29
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm18
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm23
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm22
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm16
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm17
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm25
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm27
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm30
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm19
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm29
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm20
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm18
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm22
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm27
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm21
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm31
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm26
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm30
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm23
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm19
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm20
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm21
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm16
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm24
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm28
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm31
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm29
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm26
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm23
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm16
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm18
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm17
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm25
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm24
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm30
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm28
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm29
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm18
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm19
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm22
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm27
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm17
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm31
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm25
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm30
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm19
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm26
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm20
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpxor ymm0, ymm0, ymm8
- vpxor ymm1, ymm1, ymm9
- vpxor ymm2, ymm2, ymm10
- vpxor ymm3, ymm3, ymm11
- vpxor ymm4, ymm4, ymm12
- vpxor ymm5, ymm5, ymm13
- vpxor ymm6, ymm6, ymm14
- vpxor ymm7, ymm7, ymm15
- movzx eax, byte ptr [rbp+0x38]
- jne 2b
- mov rbx, qword ptr [rbp+0x50]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
- vshufps ymm12, ymm8, ymm9, 78
- vblendps ymm1, ymm8, ymm12, 0xCC
- vshufps ymm8, ymm11, ymm0, 78
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0xCC
- vblendps ymm3, ymm12, ymm9, 0xCC
- vperm2f128 ymm12, ymm1, ymm2, 0x20
- vmovups ymmword ptr [rbx], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0xCC
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 0x20
- vmovups ymmword ptr [rbx+0x20], ymm7
- vshufps ymm5, ymm10, ymm13, 78
- vblendps ymm6, ymm5, ymm13, 0xCC
- vshufps ymm13, ymm14, ymm15, 78
- vblendps ymm10, ymm10, ymm5, 0xCC
- vblendps ymm14, ymm14, ymm13, 0xCC
- vperm2f128 ymm8, ymm10, ymm14, 0x20
- vmovups ymmword ptr [rbx+0x40], ymm8
- vblendps ymm15, ymm13, ymm15, 0xCC
- vperm2f128 ymm13, ymm6, ymm15, 0x20
- vmovups ymmword ptr [rbx+0x60], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 0x31
- vperm2f128 ymm11, ymm3, ymm4, 0x31
- vmovups ymmword ptr [rbx+0x80], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 0x31
- vperm2f128 ymm15, ymm6, ymm15, 0x31
- vmovups ymmword ptr [rbx+0xA0], ymm11
- vmovups ymmword ptr [rbx+0xC0], ymm14
- vmovups ymmword ptr [rbx+0xE0], ymm15
- vmovdqa ymm0, ymmword ptr [rsp]
- vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
- vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
- vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
- vmovdqa ymmword ptr [rsp], ymm0
- vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
- add rbx, 256
- mov qword ptr [rbp+0x50], rbx
- add rdi, 64
- sub rsi, 8
-3:
- mov rbx, qword ptr [rbp+0x50]
- mov r15, qword ptr [rsp+0x80]
- movzx r13, byte ptr [rbp+0x38]
- movzx r12, byte ptr [rbp+0x48]
- test esi, 0x4
- je 3f
- vbroadcasti32x4 zmm0, xmmword ptr [rcx]
- vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
- vmovdqa xmm12, xmmword ptr [rsp]
- vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
- vpunpckldq xmm14, xmm12, xmm13
- vpunpckhdq xmm15, xmm12, xmm13
- vpermq ymm14, ymm14, 0xDC
- vpermq ymm15, ymm15, 0xDC
- vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vinserti64x4 zmm13, zmm14, ymm15, 0x01
- mov eax, 17476
- kmovw k2, eax
- vpblendmd zmm13 {k2}, zmm13, zmm12
- vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov eax, 43690
- kmovw k3, eax
- mov eax, 34952
- kmovw k4, eax
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x88], eax
- vmovdqa32 zmm2, zmm15
- vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
- vpblendmd zmm3 {k4}, zmm13, zmm8
- vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
- vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
- vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
- vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
- vmovups zmm9, zmmword ptr [r8+rdx-0x30]
- vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
- vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
- vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
- vshufps zmm4, zmm8, zmm9, 136
- vshufps zmm5, zmm8, zmm9, 221
- vmovups zmm8, zmmword ptr [r8+rdx-0x20]
- vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
- vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
- vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
- vmovups zmm9, zmmword ptr [r8+rdx-0x10]
- vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
- vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
- vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
- vshufps zmm6, zmm8, zmm9, 136
- vshufps zmm7, zmm8, zmm9, 221
- vpshufd zmm6, zmm6, 0x93
- vpshufd zmm7, zmm7, 0x93
- mov al, 7
-9:
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 16
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 12
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 8
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 7
- vpshufd zmm0, zmm0, 0x93
- vpshufd zmm3, zmm3, 0x4E
- vpshufd zmm2, zmm2, 0x39
- vpaddd zmm0, zmm0, zmm6
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 16
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 12
- vpaddd zmm0, zmm0, zmm7
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 8
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 7
- vpshufd zmm0, zmm0, 0x39
- vpshufd zmm3, zmm3, 0x4E
- vpshufd zmm2, zmm2, 0x93
- dec al
- jz 9f
- vshufps zmm8, zmm4, zmm5, 214
- vpshufd zmm9, zmm4, 0x0F
- vpshufd zmm4, zmm8, 0x39
- vshufps zmm8, zmm6, zmm7, 250
- vpblendmd zmm9 {k3}, zmm9, zmm8
- vpunpcklqdq zmm8, zmm7, zmm5
- vpblendmd zmm8 {k4}, zmm8, zmm6
- vpshufd zmm8, zmm8, 0x78
- vpunpckhdq zmm5, zmm5, zmm7
- vpunpckldq zmm6, zmm6, zmm5
- vpshufd zmm7, zmm6, 0x1E
- vmovdqa32 zmm5, zmm9
- vmovdqa32 zmm6, zmm8
- jmp 9b
-9:
- vpxord zmm0, zmm0, zmm2
- vpxord zmm1, zmm1, zmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
- vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
- vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
- vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
- vmovdqa xmm0, xmmword ptr [rsp]
- vmovdqa xmm2, xmmword ptr [rsp+0x40]
- vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
- vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
- vmovdqa xmmword ptr [rsp], xmm0
- vmovdqa xmmword ptr [rsp+0x40], xmm2
- add rbx, 128
- add rdi, 32
- sub rsi, 4
-3:
- test esi, 0x2
- je 3f
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
- vmovd xmm13, dword ptr [rsp]
- vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
- vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovd xmm14, dword ptr [rsp+0x4]
- vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vinserti128 ymm13, ymm13, xmm14, 0x01
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x88], eax
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
- vpbroadcastd ymm8, dword ptr [rsp+0x88]
- vpblendd ymm3, ymm13, ymm8, 0x88
- vmovups ymm8, ymmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
- vshufps ymm4, ymm8, ymm9, 136
- vshufps ymm5, ymm8, ymm9, 221
- vmovups ymm8, ymmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
- vshufps ymm6, ymm8, ymm9, 136
- vshufps ymm7, ymm8, ymm9, 221
- vpshufd ymm6, ymm6, 0x93
- vpshufd ymm7, ymm7, 0x93
- mov al, 7
-9:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 16
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 12
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 8
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 7
- vpshufd ymm0, ymm0, 0x93
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x39
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 16
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 12
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 8
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 7
- vpshufd ymm0, ymm0, 0x39
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x93
- dec al
- jz 9f
- vshufps ymm8, ymm4, ymm5, 214
- vpshufd ymm9, ymm4, 0x0F
- vpshufd ymm4, ymm8, 0x39
- vshufps ymm8, ymm6, ymm7, 250
- vpblendd ymm9, ymm9, ymm8, 0xAA
- vpunpcklqdq ymm8, ymm7, ymm5
- vpblendd ymm8, ymm8, ymm6, 0x88
- vpshufd ymm8, ymm8, 0x78
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 0x1E
- vmovdqa ymm5, ymm9
- vmovdqa ymm6, ymm8
- jmp 9b
-9:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vmovdqa xmm0, xmmword ptr [rsp]
- vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
- vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
- vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
- vmovdqa xmmword ptr [rsp], xmm0
- vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
- add rbx, 64
- add rdi, 16
- sub rsi, 2
-3:
- test esi, 0x1
- je 4b
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+0x10]
- vmovd xmm14, dword ptr [rsp]
- vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- vpinsrd xmm3, xmm14, eax, 3
- vmovdqa xmm2, xmm15
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vmovups xmm9, xmmword ptr [r8+rdx-0x30]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vmovups xmm9, xmmword ptr [r8+rdx-0x10]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 0x93
- vpshufd xmm7, xmm7, 0x93
- mov al, 7
-9:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0x0F
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp 9b
-9:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-/*
- * blake3_compress_in_place_avx512
- *
- * Runs one 7-round compression over a single 64-byte message block using
- * 128-bit (xmm) registers, then writes the 32-byte result back over the
- * 32 bytes that were read from [rdi].
- *
- * Registers read on entry, as used by the code below:
- *   rdi  pointer to 32 bytes of state; loaded into xmm0/xmm1, stored at exit
- *   rsi  pointer to 64 bytes of message; read with unaligned loads
- *   dl   byte that becomes dword 2 of state row 3
- *   rcx  64-bit value that becomes dwords 0-1 of state row 3
- *   r8b  byte that becomes dword 3 of state row 3
- * NOTE(review): these presumably map to the cv, block, block_len, counter
- * and flags arguments of the C prototype - confirm against the C header.
- *
- * Registers written: rax, rdx, xmm0-xmm9 (xmm8/xmm9 are scratch).
- * Both the underscore-prefixed and the plain symbol label the same entry.
- */
-.p2align 6
-_blake3_compress_in_place_avx512:
-blake3_compress_in_place_avx512:
- _CET_ENDBR  /* macro defined outside this excerpt; presumably a CET landing pad - confirm */
- vmovdqu xmm0, xmmword ptr [rdi]  /* state row 0 = bytes 0-15 at rdi */
- vmovdqu xmm1, xmmword ptr [rdi+0x10]  /* state row 1 = bytes 16-31 at rdi */
- movzx eax, r8b
- movzx edx, dl
- shl rax, 32
- add rdx, rax  /* rdx = dl | (r8b << 32) */
- vmovq xmm3, rcx
- vmovq xmm4, rdx
- vpunpcklqdq xmm3, xmm3, xmm4  /* state row 3 = { rcx low, rcx high, dl, r8b } */
- vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* state row 2 = the four IV words (aligned load) */
- /* Load the 16 message dwords m0..m15 and split them into even/odd groups. */
- vmovups xmm8, xmmword ptr [rsi]
- vmovups xmm9, xmmword ptr [rsi+0x10]
- vshufps xmm4, xmm8, xmm9, 136  /* xmm4 = { m0, m2, m4, m6 } */
- vshufps xmm5, xmm8, xmm9, 221  /* xmm5 = { m1, m3, m5, m7 } */
- vmovups xmm8, xmmword ptr [rsi+0x20]
- vmovups xmm9, xmmword ptr [rsi+0x30]
- vshufps xmm6, xmm8, xmm9, 136  /* xmm6 = { m8, m10, m12, m14 } */
- vshufps xmm7, xmm8, xmm9, 221  /* xmm7 = { m9, m11, m13, m15 } */
- vpshufd xmm6, xmm6, 0x93  /* rotate lanes: xmm6 = { m14, m8, m10, m12 } */
- vpshufd xmm7, xmm7, 0x93  /* rotate lanes: xmm7 = { m15, m9, m11, m13 } */
- mov al, 7  /* round counter: the loop body below executes 7 times */
-9:
- /* Column step, first half: add xmm4, rotate right by 16 then 12. */
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- /* Column step, second half: add xmm5, rotate right by 8 then 7. */
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- /* Rotate the lanes of rows 0, 3 and 2 so the next step mixes diagonals. */
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- /* Diagonal step, first half: add xmm6, rotate right by 16 then 12. */
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- /* Diagonal step, second half: add xmm7, rotate right by 8 then 7. */
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- /* Undo the lane rotation applied before the diagonal step. */
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec al
- jz 9f  /* after the 7th round, skip the message reshuffle and finish */
- /* Reorder the message words held in xmm4-xmm7 for the next round. */
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0x0F
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp 9b
-9:
- /* Fold the lower state half into the upper half and store 32 bytes. */
- vpxor xmm0, xmm0, xmm2  /* row 0 ^= row 2 */
- vpxor xmm1, xmm1, xmm3  /* row 1 ^= row 3 */
- vmovdqu xmmword ptr [rdi], xmm0
- vmovdqu xmmword ptr [rdi+0x10], xmm1
- ret
-
-/*
- * blake3_compress_xof_avx512
- *
- * Runs the same 7-round compression as blake3_compress_in_place_avx512 on
- * xmm registers, but writes a 64-byte result to [r9] and never stores to
- * [rdi].
- *
- * Registers read on entry, as used by the code below:
- *   rdi  pointer to 32 bytes of input state; read at entry and again at exit
- *   rsi  pointer to 64 bytes of message; read with unaligned loads
- *   dl   byte that becomes dword 2 of state row 3
- *   rcx  64-bit value that becomes dwords 0-1 of state row 3
- *   r8b  byte that becomes dword 3 of state row 3
- *   r9   pointer to the 64-byte output buffer
- * NOTE(review): these presumably map to the cv, block, block_len, counter,
- * flags and out arguments of the C prototype - confirm against the C header.
- *
- * Output layout at [r9]:
- *   bytes  0-31  row 0 ^ row 2, row 1 ^ row 3
- *   bytes 32-63  row 2 ^ (bytes 0-15 at rdi), row 3 ^ (bytes 16-31 at rdi)
- *
- * Registers written: rax, rdx, xmm0-xmm9 (xmm8/xmm9 are scratch).
- * Both the underscore-prefixed and the plain symbol label the same entry.
- */
-.p2align 6
-_blake3_compress_xof_avx512:
-blake3_compress_xof_avx512:
- _CET_ENDBR  /* macro defined outside this excerpt; presumably a CET landing pad - confirm */
- vmovdqu xmm0, xmmword ptr [rdi]  /* state row 0 = bytes 0-15 at rdi */
- vmovdqu xmm1, xmmword ptr [rdi+0x10]  /* state row 1 = bytes 16-31 at rdi */
- movzx eax, r8b
- movzx edx, dl
- shl rax, 32
- add rdx, rax  /* rdx = dl | (r8b << 32) */
- vmovq xmm3, rcx
- vmovq xmm4, rdx
- vpunpcklqdq xmm3, xmm3, xmm4  /* state row 3 = { rcx low, rcx high, dl, r8b } */
- vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* state row 2 = the four IV words (aligned load) */
- /* Load the 16 message dwords m0..m15 and split them into even/odd groups. */
- vmovups xmm8, xmmword ptr [rsi]
- vmovups xmm9, xmmword ptr [rsi+0x10]
- vshufps xmm4, xmm8, xmm9, 136  /* xmm4 = { m0, m2, m4, m6 } */
- vshufps xmm5, xmm8, xmm9, 221  /* xmm5 = { m1, m3, m5, m7 } */
- vmovups xmm8, xmmword ptr [rsi+0x20]
- vmovups xmm9, xmmword ptr [rsi+0x30]
- vshufps xmm6, xmm8, xmm9, 136  /* xmm6 = { m8, m10, m12, m14 } */
- vshufps xmm7, xmm8, xmm9, 221  /* xmm7 = { m9, m11, m13, m15 } */
- vpshufd xmm6, xmm6, 0x93  /* rotate lanes: xmm6 = { m14, m8, m10, m12 } */
- vpshufd xmm7, xmm7, 0x93  /* rotate lanes: xmm7 = { m15, m9, m11, m13 } */
- mov al, 7  /* round counter: the loop body below executes 7 times */
-9:
- /* Column step, first half: add xmm4, rotate right by 16 then 12. */
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- /* Column step, second half: add xmm5, rotate right by 8 then 7. */
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- /* Rotate the lanes of rows 0, 3 and 2 so the next step mixes diagonals. */
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- /* Diagonal step, first half: add xmm6, rotate right by 16 then 12. */
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- /* Diagonal step, second half: add xmm7, rotate right by 8 then 7. */
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- /* Undo the lane rotation applied before the diagonal step. */
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec al
- jz 9f  /* after the 7th round, skip the message reshuffle and finish */
- /* Reorder the message words held in xmm4-xmm7 for the next round. */
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0x0F
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp 9b
-9:
- /* First 32 output bytes: upper state half folded with the lower half. */
- vpxor xmm0, xmm0, xmm2  /* row 0 ^= row 2 (uses row 2 before it is changed below) */
- vpxor xmm1, xmm1, xmm3  /* row 1 ^= row 3 (uses row 3 before it is changed below) */
- /* Last 32 output bytes: lower state half folded with the input at rdi. */
- vpxor xmm2, xmm2, [rdi]
- vpxor xmm3, xmm3, [rdi+0x10]
- vmovdqu xmmword ptr [r9], xmm0
- vmovdqu xmmword ptr [r9+0x10], xmm1
- vmovdqu xmmword ptr [r9+0x20], xmm2
- vmovdqu xmmword ptr [r9+0x30], xmm3
- ret
-
-/*
- * Read-only constant tables for the routines in this file. The section
- * directive is chosen by the preprocessor: .static_data when __APPLE__ is
- * defined, .section .rodata otherwise.
- */
-#ifdef __APPLE__
-.static_data
-#else
-.section .rodata
-#endif
-.p2align 6  /* 64-byte alignment; INDEX0, INDEX1 and ADD0 are 64 bytes each, so all three start 64-byte aligned */
-/*
- * INDEX0 / INDEX1: two tables of 16 dword indices in the range 0..31.
- * NOTE(review): presumably the index operands of the two-source dword
- * permutes in blake3_hash_many_avx512 - the loads are outside this excerpt.
- */
-INDEX0:
- .long 0, 1, 2, 3, 16, 17, 18, 19
- .long 8, 9, 10, 11, 24, 25, 26, 27
-INDEX1:
- .long 4, 5, 6, 7, 20, 21, 22, 23
- .long 12, 13, 14, 15, 28, 29, 30, 31
-/*
- * ADD0: the dwords 0..15 in order. ADD1 and ADD16: the scalars 1 and 16.
- * NOTE(review): presumably per-lane counter offsets and increments for
- * blake3_hash_many_avx512 - confirm at the use sites outside this excerpt.
- */
-ADD0:
- .long 0, 1, 2, 3, 4, 5, 6, 7
- .long 8, 9, 10, 11, 12, 13, 14, 15
-ADD1: .long 1
-
-ADD16: .long 16
-/* The value 64 as a dword; inserted into state vectors as a memory operand. */
-BLAKE3_BLOCK_LEN:
- .long 64
-.p2align 6  /* realign: BLAKE3_IV is read with aligned 128-bit loads */
-/*
- * Four IV dwords. BLAKE3_IV addresses all four as one 16-byte vector; the
- * BLAKE3_IV_n labels address each dword individually.
- */
-BLAKE3_IV:
-BLAKE3_IV_0:
- .long 0x6A09E667
-BLAKE3_IV_1:
- .long 0xBB67AE85
-BLAKE3_IV_2:
- .long 0x3C6EF372
-BLAKE3_IV_3:
- .long 0xA54FF53A
diff --git a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S
deleted file mode 100644
index e10b9f36c..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S
+++ /dev/null
@@ -1,2615 +0,0 @@
-.intel_syntax noprefix
-
-.global _blake3_hash_many_avx512
-.global blake3_hash_many_avx512
-.global blake3_compress_in_place_avx512
-.global _blake3_compress_in_place_avx512
-.global blake3_compress_xof_avx512
-.global _blake3_compress_xof_avx512
-
-.section .text
-.p2align 6
-_blake3_hash_many_avx512:
-blake3_hash_many_avx512:
- push r15
- push r14
- push r13
- push r12
- push rdi
- push rsi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 304
- and rsp, 0xFFFFFFFFFFFFFFC0
- vmovdqa xmmword ptr [rsp+0x90], xmm6
- vmovdqa xmmword ptr [rsp+0xA0], xmm7
- vmovdqa xmmword ptr [rsp+0xB0], xmm8
- vmovdqa xmmword ptr [rsp+0xC0], xmm9
- vmovdqa xmmword ptr [rsp+0xD0], xmm10
- vmovdqa xmmword ptr [rsp+0xE0], xmm11
- vmovdqa xmmword ptr [rsp+0xF0], xmm12
- vmovdqa xmmword ptr [rsp+0x100], xmm13
- vmovdqa xmmword ptr [rsp+0x110], xmm14
- vmovdqa xmmword ptr [rsp+0x120], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+0x68]
- movzx r9, byte ptr [rbp+0x70]
- neg r9
- kmovw k1, r9d
- vmovd xmm0, r8d
- vpbroadcastd ymm0, xmm0
- shr r8, 32
- vmovd xmm1, r8d
- vpbroadcastd ymm1, xmm1
- vmovdqa ymm4, ymm1
- vmovdqa ymm5, ymm1
- vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
- vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
- vpcmpltud k2, ymm2, ymm0
- vpcmpltud k3, ymm3, ymm0
- vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
- vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
- knotw k2, k1
- vmovdqa32 ymm2 {k2}, ymm0
- vmovdqa32 ymm3 {k2}, ymm0
- vmovdqa32 ymm4 {k2}, ymm1
- vmovdqa32 ymm5 {k2}, ymm1
- vmovdqa ymmword ptr [rsp], ymm2
- vmovdqa ymmword ptr [rsp+0x20], ymm3
- vmovdqa ymmword ptr [rsp+0x40], ymm4
- vmovdqa ymmword ptr [rsp+0x60], ymm5
- shl rdx, 6
- mov qword ptr [rsp+0x80], rdx
- cmp rsi, 16
- jc 3f
-2:
- vpbroadcastd zmm0, dword ptr [rcx]
- vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
- vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
- vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
- vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
- vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
- vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
- vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
- movzx eax, byte ptr [rbp+0x78]
- movzx ebx, byte ptr [rbp+0x80]
- or eax, ebx
- xor edx, edx
-.p2align 5
-9:
- movzx ebx, byte ptr [rbp+0x88]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+0x80]
- cmove eax, ebx
- mov dword ptr [rsp+0x88], eax
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x40]
- mov r13, qword ptr [rdi+0x48]
- mov r14, qword ptr [rdi+0x50]
- mov r15, qword ptr [rdi+0x58]
- vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
- vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
- vpunpcklqdq zmm8, zmm16, zmm17
- vpunpckhqdq zmm9, zmm16, zmm17
- vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
- vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
- vpunpcklqdq zmm10, zmm18, zmm19
- vpunpckhqdq zmm11, zmm18, zmm19
- mov r8, qword ptr [rdi+0x20]
- mov r9, qword ptr [rdi+0x28]
- mov r10, qword ptr [rdi+0x30]
- mov r11, qword ptr [rdi+0x38]
- mov r12, qword ptr [rdi+0x60]
- mov r13, qword ptr [rdi+0x68]
- mov r14, qword ptr [rdi+0x70]
- mov r15, qword ptr [rdi+0x78]
- vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
- vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
- vpunpcklqdq zmm12, zmm16, zmm17
- vpunpckhqdq zmm13, zmm16, zmm17
- vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
- vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
- vpunpcklqdq zmm14, zmm18, zmm19
- vpunpckhqdq zmm15, zmm18, zmm19
- vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
- vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
- vshufps zmm16, zmm8, zmm10, 136
- vshufps zmm17, zmm12, zmm14, 136
- vmovdqa32 zmm20, zmm16
- vpermt2d zmm16, zmm27, zmm17
- vpermt2d zmm20, zmm31, zmm17
- vshufps zmm17, zmm8, zmm10, 221
- vshufps zmm30, zmm12, zmm14, 221
- vmovdqa32 zmm21, zmm17
- vpermt2d zmm17, zmm27, zmm30
- vpermt2d zmm21, zmm31, zmm30
- vshufps zmm18, zmm9, zmm11, 136
- vshufps zmm8, zmm13, zmm15, 136
- vmovdqa32 zmm22, zmm18
- vpermt2d zmm18, zmm27, zmm8
- vpermt2d zmm22, zmm31, zmm8
- vshufps zmm19, zmm9, zmm11, 221
- vshufps zmm8, zmm13, zmm15, 221
- vmovdqa32 zmm23, zmm19
- vpermt2d zmm19, zmm27, zmm8
- vpermt2d zmm23, zmm31, zmm8
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x40]
- mov r13, qword ptr [rdi+0x48]
- mov r14, qword ptr [rdi+0x50]
- mov r15, qword ptr [rdi+0x58]
- vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm8, zmm24, zmm25
- vpunpckhqdq zmm9, zmm24, zmm25
- vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm10, zmm24, zmm25
- vpunpckhqdq zmm11, zmm24, zmm25
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r12+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r13+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r14+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- prefetcht0 [r15+rdx+0x80]
- mov r8, qword ptr [rdi+0x20]
- mov r9, qword ptr [rdi+0x28]
- mov r10, qword ptr [rdi+0x30]
- mov r11, qword ptr [rdi+0x38]
- mov r12, qword ptr [rdi+0x60]
- mov r13, qword ptr [rdi+0x68]
- mov r14, qword ptr [rdi+0x70]
- mov r15, qword ptr [rdi+0x78]
- vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm12, zmm24, zmm25
- vpunpckhqdq zmm13, zmm24, zmm25
- vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
- vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
- vpunpcklqdq zmm14, zmm24, zmm25
- vpunpckhqdq zmm15, zmm24, zmm25
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r12+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r13+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r14+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- prefetcht0 [r15+rdx+0x80]
- vshufps zmm24, zmm8, zmm10, 136
- vshufps zmm30, zmm12, zmm14, 136
- vmovdqa32 zmm28, zmm24
- vpermt2d zmm24, zmm27, zmm30
- vpermt2d zmm28, zmm31, zmm30
- vshufps zmm25, zmm8, zmm10, 221
- vshufps zmm30, zmm12, zmm14, 221
- vmovdqa32 zmm29, zmm25
- vpermt2d zmm25, zmm27, zmm30
- vpermt2d zmm29, zmm31, zmm30
- vshufps zmm26, zmm9, zmm11, 136
- vshufps zmm8, zmm13, zmm15, 136
- vmovdqa32 zmm30, zmm26
- vpermt2d zmm26, zmm27, zmm8
- vpermt2d zmm30, zmm31, zmm8
- vshufps zmm8, zmm9, zmm11, 221
- vshufps zmm10, zmm13, zmm15, 221
- vpermi2d zmm27, zmm8, zmm10
- vpermi2d zmm31, zmm8, zmm10
- vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
- vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
- vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
- vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
- vmovdqa32 zmm12, zmmword ptr [rsp]
- vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
- vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
- vpaddd zmm0, zmm0, zmm16
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm20
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm17
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm21
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm24
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm28
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm25
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm29
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm18
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm23
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm22
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm16
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm17
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm25
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm27
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm30
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm19
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm29
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm20
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm18
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm22
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm27
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm21
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm31
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm26
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm30
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm23
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm19
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm20
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm21
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm16
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm24
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm28
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm31
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm29
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm26
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm23
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm16
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm18
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm17
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm25
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm24
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm30
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm28
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm29
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm18
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm19
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm22
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm27
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm17
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm31
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm25
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm30
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm19
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm26
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm20
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpxord zmm0, zmm0, zmm8
- vpxord zmm1, zmm1, zmm9
- vpxord zmm2, zmm2, zmm10
- vpxord zmm3, zmm3, zmm11
- vpxord zmm4, zmm4, zmm12
- vpxord zmm5, zmm5, zmm13
- vpxord zmm6, zmm6, zmm14
- vpxord zmm7, zmm7, zmm15
- movzx eax, byte ptr [rbp+0x78]
- jne 9b
- mov rbx, qword ptr [rbp+0x90]
- vpunpckldq zmm16, zmm0, zmm1
- vpunpckhdq zmm17, zmm0, zmm1
- vpunpckldq zmm18, zmm2, zmm3
- vpunpckhdq zmm19, zmm2, zmm3
- vpunpckldq zmm20, zmm4, zmm5
- vpunpckhdq zmm21, zmm4, zmm5
- vpunpckldq zmm22, zmm6, zmm7
- vpunpckhdq zmm23, zmm6, zmm7
- vpunpcklqdq zmm0, zmm16, zmm18
- vpunpckhqdq zmm1, zmm16, zmm18
- vpunpcklqdq zmm2, zmm17, zmm19
- vpunpckhqdq zmm3, zmm17, zmm19
- vpunpcklqdq zmm4, zmm20, zmm22
- vpunpckhqdq zmm5, zmm20, zmm22
- vpunpcklqdq zmm6, zmm21, zmm23
- vpunpckhqdq zmm7, zmm21, zmm23
- vshufi32x4 zmm16, zmm0, zmm4, 0x88
- vshufi32x4 zmm17, zmm1, zmm5, 0x88
- vshufi32x4 zmm18, zmm2, zmm6, 0x88
- vshufi32x4 zmm19, zmm3, zmm7, 0x88
- vshufi32x4 zmm20, zmm0, zmm4, 0xDD
- vshufi32x4 zmm21, zmm1, zmm5, 0xDD
- vshufi32x4 zmm22, zmm2, zmm6, 0xDD
- vshufi32x4 zmm23, zmm3, zmm7, 0xDD
- vshufi32x4 zmm0, zmm16, zmm17, 0x88
- vshufi32x4 zmm1, zmm18, zmm19, 0x88
- vshufi32x4 zmm2, zmm20, zmm21, 0x88
- vshufi32x4 zmm3, zmm22, zmm23, 0x88
- vshufi32x4 zmm4, zmm16, zmm17, 0xDD
- vshufi32x4 zmm5, zmm18, zmm19, 0xDD
- vshufi32x4 zmm6, zmm20, zmm21, 0xDD
- vshufi32x4 zmm7, zmm22, zmm23, 0xDD
- vmovdqu32 zmmword ptr [rbx], zmm0
- vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
- vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
- vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
- vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
- vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
- vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
- vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
- vmovdqa32 zmm0, zmmword ptr [rsp]
- vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
- vmovdqa32 zmm2, zmm0
- vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
- vpcmpltud k2, zmm2, zmm0
- vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
- vmovdqa32 zmmword ptr [rsp], zmm2
- vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
- add rdi, 128
- add rbx, 512
- mov qword ptr [rbp+0x90], rbx
- sub rsi, 16
- cmp rsi, 16
- jnc 2b
- test rsi, rsi
- jne 3f
-4:
- vzeroupper
- vmovdqa xmm6, xmmword ptr [rsp+0x90]
- vmovdqa xmm7, xmmword ptr [rsp+0xA0]
- vmovdqa xmm8, xmmword ptr [rsp+0xB0]
- vmovdqa xmm9, xmmword ptr [rsp+0xC0]
- vmovdqa xmm10, xmmword ptr [rsp+0xD0]
- vmovdqa xmm11, xmmword ptr [rsp+0xE0]
- vmovdqa xmm12, xmmword ptr [rsp+0xF0]
- vmovdqa xmm13, xmmword ptr [rsp+0x100]
- vmovdqa xmm14, xmmword ptr [rsp+0x110]
- vmovdqa xmm15, xmmword ptr [rsp+0x120]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rsi
- pop rdi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 6
-3:
- test esi, 0x8
- je 3f
- vpbroadcastd ymm0, dword ptr [rcx]
- vpbroadcastd ymm1, dword ptr [rcx+0x4]
- vpbroadcastd ymm2, dword ptr [rcx+0x8]
- vpbroadcastd ymm3, dword ptr [rcx+0xC]
- vpbroadcastd ymm4, dword ptr [rcx+0x10]
- vpbroadcastd ymm5, dword ptr [rcx+0x14]
- vpbroadcastd ymm6, dword ptr [rcx+0x18]
- vpbroadcastd ymm7, dword ptr [rcx+0x1C]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov r12, qword ptr [rdi+0x20]
- mov r13, qword ptr [rdi+0x28]
- mov r14, qword ptr [rdi+0x30]
- mov r15, qword ptr [rdi+0x38]
- movzx eax, byte ptr [rbp+0x78]
- movzx ebx, byte ptr [rbp+0x80]
- or eax, ebx
- xor edx, edx
-2:
- movzx ebx, byte ptr [rbp+0x88]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+0x80]
- cmove eax, ebx
- mov dword ptr [rsp+0x88], eax
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x40]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x40]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x40]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm16, ymm12, ymm14, 136
- vshufps ymm17, ymm12, ymm14, 221
- vshufps ymm18, ymm13, ymm15, 136
- vshufps ymm19, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x30]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x30]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x30]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm20, ymm12, ymm14, 136
- vshufps ymm21, ymm12, ymm14, 221
- vshufps ymm22, ymm13, ymm15, 136
- vshufps ymm23, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x20]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x20]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x20]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm24, ymm12, ymm14, 136
- vshufps ymm25, ymm12, ymm14, 221
- vshufps ymm26, ymm13, ymm15, 136
- vshufps ymm27, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x10]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
- vmovups xmm9, xmmword ptr [r9+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-0x10]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
- vmovups xmm11, xmmword ptr [r11+rdx-0x10]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm28, ymm12, ymm14, 136
- vshufps ymm29, ymm12, ymm14, 221
- vshufps ymm30, ymm13, ymm15, 136
- vshufps ymm31, ymm13, ymm15, 221
- vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
- vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
- vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
- vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
- vmovdqa ymm12, ymmword ptr [rsp]
- vmovdqa ymm13, ymmword ptr [rsp+0x40]
- vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vpbroadcastd ymm15, dword ptr [rsp+0x88]
- vpaddd ymm0, ymm0, ymm16
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm20
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm17
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm21
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm24
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm28
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm25
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm29
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm18
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm23
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm22
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm16
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm17
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm25
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm27
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm30
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm19
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm29
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm20
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm18
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm22
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm27
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm21
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm31
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm26
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm30
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm23
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm19
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm20
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm21
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm16
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm24
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm28
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm31
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm29
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm26
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm23
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm16
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm18
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm17
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm25
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm24
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm30
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm28
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm29
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm18
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm19
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm22
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm27
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm17
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm31
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm25
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm30
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm19
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm26
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm20
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpxor ymm0, ymm0, ymm8
- vpxor ymm1, ymm1, ymm9
- vpxor ymm2, ymm2, ymm10
- vpxor ymm3, ymm3, ymm11
- vpxor ymm4, ymm4, ymm12
- vpxor ymm5, ymm5, ymm13
- vpxor ymm6, ymm6, ymm14
- vpxor ymm7, ymm7, ymm15
- movzx eax, byte ptr [rbp+0x78]
- jne 2b
- mov rbx, qword ptr [rbp+0x90]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
- vshufps ymm12, ymm8, ymm9, 78
- vblendps ymm1, ymm8, ymm12, 0xCC
- vshufps ymm8, ymm11, ymm0, 78
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0xCC
- vblendps ymm3, ymm12, ymm9, 0xCC
- vperm2f128 ymm12, ymm1, ymm2, 0x20
- vmovups ymmword ptr [rbx], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0xCC
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 0x20
- vmovups ymmword ptr [rbx+0x20], ymm7
- vshufps ymm5, ymm10, ymm13, 78
- vblendps ymm6, ymm5, ymm13, 0xCC
- vshufps ymm13, ymm14, ymm15, 78
- vblendps ymm10, ymm10, ymm5, 0xCC
- vblendps ymm14, ymm14, ymm13, 0xCC
- vperm2f128 ymm8, ymm10, ymm14, 0x20
- vmovups ymmword ptr [rbx+0x40], ymm8
- vblendps ymm15, ymm13, ymm15, 0xCC
- vperm2f128 ymm13, ymm6, ymm15, 0x20
- vmovups ymmword ptr [rbx+0x60], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 0x31
- vperm2f128 ymm11, ymm3, ymm4, 0x31
- vmovups ymmword ptr [rbx+0x80], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 0x31
- vperm2f128 ymm15, ymm6, ymm15, 0x31
- vmovups ymmword ptr [rbx+0xA0], ymm11
- vmovups ymmword ptr [rbx+0xC0], ymm14
- vmovups ymmword ptr [rbx+0xE0], ymm15
- vmovdqa ymm0, ymmword ptr [rsp]
- vmovdqa ymm2, ymmword ptr [rsp+0x40]
- vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
- vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
- vmovdqa ymmword ptr [rsp], ymm0
- vmovdqa ymmword ptr [rsp+0x40], ymm2
- add rbx, 256
- mov qword ptr [rbp+0x90], rbx
- add rdi, 64
- sub rsi, 8
-3:
- mov rbx, qword ptr [rbp+0x90]
- mov r15, qword ptr [rsp+0x80]
- movzx r13, byte ptr [rbp+0x78]
- movzx r12, byte ptr [rbp+0x88]
- test esi, 0x4
- je 3f
- vbroadcasti32x4 zmm0, xmmword ptr [rcx]
- vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
- vmovdqa xmm12, xmmword ptr [rsp]
- vmovdqa xmm13, xmmword ptr [rsp+0x40]
- vpunpckldq xmm14, xmm12, xmm13
- vpunpckhdq xmm15, xmm12, xmm13
- vpermq ymm14, ymm14, 0xDC
- vpermq ymm15, ymm15, 0xDC
- vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vinserti64x4 zmm13, zmm14, ymm15, 0x01
- mov eax, 17476
- kmovw k2, eax
- vpblendmd zmm13 {k2}, zmm13, zmm12
- vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- mov eax, 43690
- kmovw k3, eax
- mov eax, 34952
- kmovw k4, eax
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x88], eax
- vmovdqa32 zmm2, zmm15
- vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
- vpblendmd zmm3 {k4}, zmm13, zmm8
- vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
- vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
- vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
- vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
- vmovups zmm9, zmmword ptr [r8+rdx-0x30]
- vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
- vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
- vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
- vshufps zmm4, zmm8, zmm9, 136
- vshufps zmm5, zmm8, zmm9, 221
- vmovups zmm8, zmmword ptr [r8+rdx-0x20]
- vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
- vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
- vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
- vmovups zmm9, zmmword ptr [r8+rdx-0x10]
- vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
- vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
- vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
- vshufps zmm6, zmm8, zmm9, 136
- vshufps zmm7, zmm8, zmm9, 221
- vpshufd zmm6, zmm6, 0x93
- vpshufd zmm7, zmm7, 0x93
- mov al, 7
-9:
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 16
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 12
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 8
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 7
- vpshufd zmm0, zmm0, 0x93
- vpshufd zmm3, zmm3, 0x4E
- vpshufd zmm2, zmm2, 0x39
- vpaddd zmm0, zmm0, zmm6
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 16
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 12
- vpaddd zmm0, zmm0, zmm7
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 8
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 7
- vpshufd zmm0, zmm0, 0x39
- vpshufd zmm3, zmm3, 0x4E
- vpshufd zmm2, zmm2, 0x93
- dec al
- jz 9f
- vshufps zmm8, zmm4, zmm5, 214
- vpshufd zmm9, zmm4, 0x0F
- vpshufd zmm4, zmm8, 0x39
- vshufps zmm8, zmm6, zmm7, 250
- vpblendmd zmm9 {k3}, zmm9, zmm8
- vpunpcklqdq zmm8, zmm7, zmm5
- vpblendmd zmm8 {k4}, zmm8, zmm6
- vpshufd zmm8, zmm8, 0x78
- vpunpckhdq zmm5, zmm5, zmm7
- vpunpckldq zmm6, zmm6, zmm5
- vpshufd zmm7, zmm6, 0x1E
- vmovdqa32 zmm5, zmm9
- vmovdqa32 zmm6, zmm8
- jmp 9b
-9:
- vpxord zmm0, zmm0, zmm2
- vpxord zmm1, zmm1, zmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
- vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
- vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
- vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
- vmovdqa xmm0, xmmword ptr [rsp]
- vmovdqa xmm2, xmmword ptr [rsp+0x40]
- vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
- vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
- vmovdqa xmmword ptr [rsp], xmm0
- vmovdqa xmmword ptr [rsp+0x40], xmm2
- add rbx, 128
- add rdi, 32
- sub rsi, 4
-3:
- test esi, 0x2
- je 3f
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
- vmovd xmm13, dword ptr [rsp]
- vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
- vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovd xmm14, dword ptr [rsp+0x4]
- vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vinserti128 ymm13, ymm13, xmm14, 0x01
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+0x88], eax
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
- vpbroadcastd ymm8, dword ptr [rsp+0x88]
- vpblendd ymm3, ymm13, ymm8, 0x88
- vmovups ymm8, ymmword ptr [r8+rdx-0x40]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x30]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
- vshufps ymm4, ymm8, ymm9, 136
- vshufps ymm5, ymm8, ymm9, 221
- vmovups ymm8, ymmword ptr [r8+rdx-0x20]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
- vmovups ymm9, ymmword ptr [r8+rdx-0x10]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
- vshufps ymm6, ymm8, ymm9, 136
- vshufps ymm7, ymm8, ymm9, 221
- vpshufd ymm6, ymm6, 0x93
- vpshufd ymm7, ymm7, 0x93
- mov al, 7
-9:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 16
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 12
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 8
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 7
- vpshufd ymm0, ymm0, 0x93
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x39
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 16
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 12
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 8
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 7
- vpshufd ymm0, ymm0, 0x39
- vpshufd ymm3, ymm3, 0x4E
- vpshufd ymm2, ymm2, 0x93
- dec al
- jz 9f
- vshufps ymm8, ymm4, ymm5, 214
- vpshufd ymm9, ymm4, 0x0F
- vpshufd ymm4, ymm8, 0x39
- vshufps ymm8, ymm6, ymm7, 250
- vpblendd ymm9, ymm9, ymm8, 0xAA
- vpunpcklqdq ymm8, ymm7, ymm5
- vpblendd ymm8, ymm8, ymm6, 0x88
- vpshufd ymm8, ymm8, 0x78
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 0x1E
- vmovdqa ymm5, ymm9
- vmovdqa ymm6, ymm8
- jmp 9b
-9:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
- vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- vmovdqa xmm0, xmmword ptr [rsp]
- vmovdqa xmm2, xmmword ptr [rsp+0x40]
- vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
- vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
- vmovdqa xmmword ptr [rsp], xmm0
- vmovdqa xmmword ptr [rsp+0x40], xmm2
- add rbx, 64
- add rdi, 16
- sub rsi, 2
-3:
- test esi, 0x1
- je 4b
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+0x10]
- vmovd xmm14, dword ptr [rsp]
- vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-.p2align 5
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- vpinsrd xmm3, xmm14, eax, 3
- vmovdqa xmm2, xmm15
- vmovups xmm8, xmmword ptr [r8+rdx-0x40]
- vmovups xmm9, xmmword ptr [r8+rdx-0x30]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [r8+rdx-0x20]
- vmovups xmm9, xmmword ptr [r8+rdx-0x10]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 0x93
- vpshufd xmm7, xmm7, 0x93
- mov al, 7
-9:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0x0F
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp 9b
-9:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-
-
-.p2align 6
-_blake3_compress_in_place_avx512: # Compresses the 64-byte block at [rdx] into the 32-byte value at [rcx], in place, using 7 rounds.
-blake3_compress_in_place_avx512: # Same entry point without the underscore. Inputs read below: rcx, rdx, r8b, r9 and one byte stack argument.
- sub rsp, 72 # reserve 72 bytes; the first 64 hold the saved xmm6-xmm9
- vmovdqa xmmword ptr [rsp], xmm6 # save xmm6-xmm9 (restored before ret); aligned store, assumes rsp is 16-byte aligned here
- vmovdqa xmmword ptr [rsp+0x10], xmm7
- vmovdqa xmmword ptr [rsp+0x20], xmm8
- vmovdqa xmmword ptr [rsp+0x30], xmm9
- vmovdqu xmm0, xmmword ptr [rcx] # state row 0 = bytes 0..15 at rcx
- vmovdqu xmm1, xmmword ptr [rcx+0x10] # state row 1 = bytes 16..31 at rcx
- movzx eax, byte ptr [rsp+0x70] # byte stack argument (0x70 = 72 + 0x28 above the entry rsp); presumably the flags byte - confirm against the C prototype
- movzx r8d, r8b # zero-extend the byte passed in r8b (presumably the block length - confirm)
- shl rax, 32 # move the stack byte into the upper dword
- add r8, rax # r8 = { low dword: r8b value, high dword: stack byte }
- vmovq xmm3, r9 # low 64 bits of row 3 = r9 (presumably the 64-bit counter - confirm)
- vmovq xmm4, r8
- vpunpcklqdq xmm3, xmm3, xmm4 # state row 3 = { r9 (64 bits), r8b value, stack byte }
- vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] # state row 2 = the 16 bytes at BLAKE3_IV
- vmovups xmm8, xmmword ptr [rdx] # block dwords 0..3
- vmovups xmm9, xmmword ptr [rdx+0x10] # block dwords 4..7
- vshufps xmm4, xmm8, xmm9, 136 # imm 0x88: xmm4 = block dwords 0,2,4,6
- vshufps xmm5, xmm8, xmm9, 221 # imm 0xDD: xmm5 = block dwords 1,3,5,7
- vmovups xmm8, xmmword ptr [rdx+0x20] # block dwords 8..11
- vmovups xmm9, xmmword ptr [rdx+0x30] # block dwords 12..15
- vshufps xmm6, xmm8, xmm9, 136 # xmm6 = block dwords 8,10,12,14
- vshufps xmm7, xmm8, xmm9, 221 # xmm7 = block dwords 9,11,13,15
- vpshufd xmm6, xmm6, 0x93 # rotate lanes: xmm6 = dwords 14,8,10,12
- vpshufd xmm7, xmm7, 0x93 # rotate lanes: xmm7 = dwords 15,9,11,13
- mov al, 7 # round counter: the loop body below runs 7 times
-9: # top of the round loop (target of "jmp 9b")
- vpaddd xmm0, xmm0, xmm4 # first half-round: row0 += message words in xmm4
- vpaddd xmm0, xmm0, xmm1 # row0 += row1
- vpxord xmm3, xmm3, xmm0 # row3 ^= row0
- vprord xmm3, xmm3, 16 # row3 = rotate-right each dword by 16
- vpaddd xmm2, xmm2, xmm3 # row2 += row3
- vpxord xmm1, xmm1, xmm2 # row1 ^= row2
- vprord xmm1, xmm1, 12 # row1 = rotate-right by 12
- vpaddd xmm0, xmm0, xmm5 # row0 += message words in xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8 # rotate-right by 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7 # rotate-right by 7
- vpshufd xmm0, xmm0, 0x93 # rotate the lanes of rows 0, 3 and 2 by 1, 2 and 3 positions so the
- vpshufd xmm3, xmm3, 0x4E # second half-round mixes a different combination of lanes
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm6 # second half-round: same add/xor/rotate sequence with xmm6 then xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 0x39 # undo the lane rotation applied before the second half-round
- vpshufd xmm3, xmm3, 0x4E # (0x39 inverts 0x93, 0x4E is its own inverse, 0x93 inverts 0x39)
- vpshufd xmm2, xmm2, 0x93
- dec al # one round done
- jz 9f # after the 7th round skip the message permutation and finish
- vshufps xmm8, xmm4, xmm5, 214 # rearrange the message words held in xmm4..xmm7 for the next round;
- vpshufd xmm9, xmm4, 0x0F # xmm8 and xmm9 are temporaries, the state rows xmm0..xmm3 are untouched
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9 # commit the new xmm5 and xmm6 from the temporaries
- vmovdqa xmm6, xmm8
- jmp 9b # next round
-9: # finalization, reached from "jz 9f"
- vpxor xmm0, xmm0, xmm2 # result bytes 0..15 = row0 ^ row2
- vpxor xmm1, xmm1, xmm3 # result bytes 16..31 = row1 ^ row3
- vmovdqu xmmword ptr [rcx], xmm0 # overwrite the 32 bytes at rcx with the result
- vmovdqu xmmword ptr [rcx+0x10], xmm1
- vmovdqa xmm6, xmmword ptr [rsp] # restore the saved xmm6-xmm9
- vmovdqa xmm7, xmmword ptr [rsp+0x10]
- vmovdqa xmm8, xmmword ptr [rsp+0x20]
- vmovdqa xmm9, xmmword ptr [rsp+0x30]
- add rsp, 72 # release the scratch area
- ret
-
-
-.p2align 6
-_blake3_compress_xof_avx512: # Compresses the 64-byte block at [rdx] with the 32-byte value at [rcx] and writes a 64-byte result to the pointer taken from the stack; [rcx] is only read.
-blake3_compress_xof_avx512: # Same entry point without the underscore. Inputs read below: rcx, rdx, r8b, r9 and two stack arguments.
- sub rsp, 72 # reserve 72 bytes; the first 64 hold the saved xmm6-xmm9
- vmovdqa xmmword ptr [rsp], xmm6 # save xmm6-xmm9 (restored before ret); aligned store, assumes rsp is 16-byte aligned here
- vmovdqa xmmword ptr [rsp+0x10], xmm7
- vmovdqa xmmword ptr [rsp+0x20], xmm8
- vmovdqa xmmword ptr [rsp+0x30], xmm9
- vmovdqu xmm0, xmmword ptr [rcx] # state row 0 = bytes 0..15 at rcx
- vmovdqu xmm1, xmmword ptr [rcx+0x10] # state row 1 = bytes 16..31 at rcx
- movzx eax, byte ptr [rsp+0x70] # byte stack argument (0x70 = 72 + 0x28 above the entry rsp); presumably the flags byte - confirm against the C prototype
- movzx r8d, r8b # zero-extend the byte passed in r8b (presumably the block length - confirm)
- mov r10, qword ptr [rsp+0x78] # second stack argument (72 + 0x30 above the entry rsp): destination pointer for the 64-byte result
- shl rax, 32 # move the stack byte into the upper dword
- add r8, rax # r8 = { low dword: r8b value, high dword: stack byte }
- vmovq xmm3, r9 # low 64 bits of row 3 = r9 (presumably the 64-bit counter - confirm)
- vmovq xmm4, r8
- vpunpcklqdq xmm3, xmm3, xmm4 # state row 3 = { r9 (64 bits), r8b value, stack byte }
- vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] # state row 2 = the 16 bytes at BLAKE3_IV
- vmovups xmm8, xmmword ptr [rdx] # block dwords 0..3
- vmovups xmm9, xmmword ptr [rdx+0x10] # block dwords 4..7
- vshufps xmm4, xmm8, xmm9, 136 # imm 0x88: xmm4 = block dwords 0,2,4,6
- vshufps xmm5, xmm8, xmm9, 221 # imm 0xDD: xmm5 = block dwords 1,3,5,7
- vmovups xmm8, xmmword ptr [rdx+0x20] # block dwords 8..11
- vmovups xmm9, xmmword ptr [rdx+0x30] # block dwords 12..15
- vshufps xmm6, xmm8, xmm9, 136 # xmm6 = block dwords 8,10,12,14
- vshufps xmm7, xmm8, xmm9, 221 # xmm7 = block dwords 9,11,13,15
- vpshufd xmm6, xmm6, 0x93 # rotate lanes: xmm6 = dwords 14,8,10,12
- vpshufd xmm7, xmm7, 0x93 # rotate lanes: xmm7 = dwords 15,9,11,13
- mov al, 7 # round counter: the loop body below runs 7 times
-9: # top of the round loop (target of "jmp 9b")
- vpaddd xmm0, xmm0, xmm4 # first half-round: row0 += message words in xmm4
- vpaddd xmm0, xmm0, xmm1 # row0 += row1
- vpxord xmm3, xmm3, xmm0 # row3 ^= row0
- vprord xmm3, xmm3, 16 # row3 = rotate-right each dword by 16
- vpaddd xmm2, xmm2, xmm3 # row2 += row3
- vpxord xmm1, xmm1, xmm2 # row1 ^= row2
- vprord xmm1, xmm1, 12 # row1 = rotate-right by 12
- vpaddd xmm0, xmm0, xmm5 # row0 += message words in xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8 # rotate-right by 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7 # rotate-right by 7
- vpshufd xmm0, xmm0, 0x93 # rotate the lanes of rows 0, 3 and 2 by 1, 2 and 3 positions so the
- vpshufd xmm3, xmm3, 0x4E # second half-round mixes a different combination of lanes
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm6 # second half-round: same add/xor/rotate sequence with xmm6 then xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 0x39 # undo the lane rotation applied before the second half-round
- vpshufd xmm3, xmm3, 0x4E # (0x39 inverts 0x93, 0x4E is its own inverse, 0x93 inverts 0x39)
- vpshufd xmm2, xmm2, 0x93
- dec al # one round done
- jz 9f # after the 7th round skip the message permutation and finish
- vshufps xmm8, xmm4, xmm5, 214 # rearrange the message words held in xmm4..xmm7 for the next round;
- vpshufd xmm9, xmm4, 0x0F # xmm8 and xmm9 are temporaries, the state rows xmm0..xmm3 are untouched
- vpshufd xmm4, xmm8, 0x39
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0xAA
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 0x88
- vpshufd xmm8, xmm8, 0x78
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 0x1E
- vmovdqa xmm5, xmm9 # commit the new xmm5 and xmm6 from the temporaries
- vmovdqa xmm6, xmm8
- jmp 9b # next round
-9: # finalization, reached from "jz 9f"
- vpxor xmm0, xmm0, xmm2 # output bytes 0..15 = row0 ^ row2
- vpxor xmm1, xmm1, xmm3 # output bytes 16..31 = row1 ^ row3
- vpxor xmm2, xmm2, xmmword ptr [rcx] # output bytes 32..47 = row2 ^ input bytes 0..15 at rcx
- vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] # output bytes 48..63 = row3 ^ input bytes 16..31 at rcx
- vmovdqu xmmword ptr [r10], xmm0 # store all 64 output bytes through the destination pointer
- vmovdqu xmmword ptr [r10+0x10], xmm1
- vmovdqu xmmword ptr [r10+0x20], xmm2
- vmovdqu xmmword ptr [r10+0x30], xmm3
- vmovdqa xmm6, xmmword ptr [rsp] # restore the saved xmm6-xmm9
- vmovdqa xmm7, xmmword ptr [rsp+0x10]
- vmovdqa xmm8, xmmword ptr [rsp+0x20]
- vmovdqa xmm9, xmmword ptr [rsp+0x30]
- add rsp, 72 # release the scratch area
- ret
-
-.section .rodata # constant tables referenced RIP-relative by the routines in this file
-.p2align 6 # start the tables on a 64-byte boundary
-INDEX0: # 16 dword indices in the range 0..31; NOTE(review): the use site is outside this excerpt - looks like a two-source dword permute index, confirm
- .long 0, 1, 2, 3, 16, 17, 18, 19
- .long 8, 9, 10, 11, 24, 25, 26, 27
-INDEX1: # companion index table to INDEX0 (the remaining values 4..7, 20..23, 12..15, 28..31)
- .long 4, 5, 6, 7, 20, 21, 22, 23
- .long 12, 13, 14, 15, 28, 29, 30, 31
-ADD0: # the dwords 0..15; NOTE(review): presumably per-lane counter offsets - use site is outside this excerpt, confirm
- .long 0, 1, 2, 3, 4, 5, 6, 7
- .long 8, 9, 10, 11, 12, 13, 14, 15
-ADD1: .long 1 # dword constant 1
-
-ADD16: .long 16 # dword constant 16
-BLAKE3_BLOCK_LEN: # dword constant 64, inserted into the counter/length/flags vector by the hash-many tail paths
- .long 64
-.p2align 6 # 64-byte boundary, which also satisfies the aligned 16-byte loads (vmovaps/vmovdqa) of BLAKE3_IV
-BLAKE3_IV: # four consecutive dwords, loaded as one 16-byte vector into the third state row
-BLAKE3_IV_0: # each dword also has its own label
- .long 0x6A09E667
-BLAKE3_IV_1:
- .long 0xBB67AE85
-BLAKE3_IV_2:
- .long 0x3C6EF372
-BLAKE3_IV_3:
- .long 0xA54FF53A
diff --git a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm
deleted file mode 100644
index b19efbaae..000000000
--- a/thirdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm
+++ /dev/null
@@ -1,2634 +0,0 @@
-public _blake3_hash_many_avx512 ; export list: each routine is made public under two names,
-public blake3_hash_many_avx512 ; one with and one without a leading underscore
-public blake3_compress_in_place_avx512 ; NOTE(review): only the hash_many PROC is visible in this excerpt - the two compress routines are presumably defined further down, confirm
-public _blake3_compress_in_place_avx512
-public blake3_compress_xof_avx512
-public _blake3_compress_xof_avx512
-
-_TEXT SEGMENT ALIGN(16) 'CODE'
-
-ALIGN 16
-blake3_hash_many_avx512 PROC
-_blake3_hash_many_avx512 PROC
- push r15
- push r14
- push r13
- push r12
- push rdi
- push rsi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 304
- and rsp, 0FFFFFFFFFFFFFFC0H
- vmovdqa xmmword ptr [rsp+90H], xmm6
- vmovdqa xmmword ptr [rsp+0A0H], xmm7
- vmovdqa xmmword ptr [rsp+0B0H], xmm8
- vmovdqa xmmword ptr [rsp+0C0H], xmm9
- vmovdqa xmmword ptr [rsp+0D0H], xmm10
- vmovdqa xmmword ptr [rsp+0E0H], xmm11
- vmovdqa xmmword ptr [rsp+0F0H], xmm12
- vmovdqa xmmword ptr [rsp+100H], xmm13
- vmovdqa xmmword ptr [rsp+110H], xmm14
- vmovdqa xmmword ptr [rsp+120H], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+68H]
- movzx r9, byte ptr [rbp+70H]
- neg r9
- kmovw k1, r9d
- vmovd xmm0, r8d
- vpbroadcastd ymm0, xmm0
- shr r8, 32
- vmovd xmm1, r8d
- vpbroadcastd ymm1, xmm1
- vmovdqa ymm4, ymm1
- vmovdqa ymm5, ymm1
- vpaddd ymm2, ymm0, ymmword ptr [ADD0]
- vpaddd ymm3, ymm0, ymmword ptr [ADD0+32]
- vpcmpud k2, ymm2, ymm0, 1
- vpcmpud k3, ymm3, ymm0, 1
- ; XXX: ml64.exe does not currently understand the syntax. We use a workaround.
- vpbroadcastd ymm6, dword ptr [ADD1]
- vpaddd ymm4 {k2}, ymm4, ymm6
- vpaddd ymm5 {k3}, ymm5, ymm6
- ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8}
- ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8}
- knotw k2, k1
- vmovdqa32 ymm2 {k2}, ymm0
- vmovdqa32 ymm3 {k2}, ymm0
- vmovdqa32 ymm4 {k2}, ymm1
- vmovdqa32 ymm5 {k2}, ymm1
- vmovdqa ymmword ptr [rsp], ymm2
- vmovdqa ymmword ptr [rsp+20H], ymm3
- vmovdqa ymmword ptr [rsp+40H], ymm4
- vmovdqa ymmword ptr [rsp+60H], ymm5
- shl rdx, 6
- mov qword ptr [rsp+80H], rdx
- cmp rsi, 16
- jc final15blocks
-outerloop16:
- vpbroadcastd zmm0, dword ptr [rcx]
- vpbroadcastd zmm1, dword ptr [rcx+1H*4H]
- vpbroadcastd zmm2, dword ptr [rcx+2H*4H]
- vpbroadcastd zmm3, dword ptr [rcx+3H*4H]
- vpbroadcastd zmm4, dword ptr [rcx+4H*4H]
- vpbroadcastd zmm5, dword ptr [rcx+5H*4H]
- vpbroadcastd zmm6, dword ptr [rcx+6H*4H]
- vpbroadcastd zmm7, dword ptr [rcx+7H*4H]
- movzx eax, byte ptr [rbp+78H]
- movzx ebx, byte ptr [rbp+80H]
- or eax, ebx
- xor edx, edx
-ALIGN 16
-innerloop16:
- movzx ebx, byte ptr [rbp+88H]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+80H]
- cmove eax, ebx
- mov dword ptr [rsp+88H], eax
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- mov r12, qword ptr [rdi+40H]
- mov r13, qword ptr [rdi+48H]
- mov r14, qword ptr [rdi+50H]
- mov r15, qword ptr [rdi+58H]
- vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
- vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
- vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
- vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
- vpunpcklqdq zmm8, zmm16, zmm17
- vpunpckhqdq zmm9, zmm16, zmm17
- vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
- vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
- vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
- vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
- vpunpcklqdq zmm10, zmm18, zmm19
- vpunpckhqdq zmm11, zmm18, zmm19
- mov r8, qword ptr [rdi+20H]
- mov r9, qword ptr [rdi+28H]
- mov r10, qword ptr [rdi+30H]
- mov r11, qword ptr [rdi+38H]
- mov r12, qword ptr [rdi+60H]
- mov r13, qword ptr [rdi+68H]
- mov r14, qword ptr [rdi+70H]
- mov r15, qword ptr [rdi+78H]
- vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
- vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
- vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
- vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
- vpunpcklqdq zmm12, zmm16, zmm17
- vpunpckhqdq zmm13, zmm16, zmm17
- vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
- vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
- vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
- vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
- vpunpcklqdq zmm14, zmm18, zmm19
- vpunpckhqdq zmm15, zmm18, zmm19
- vmovdqa32 zmm27, zmmword ptr [INDEX0]
- vmovdqa32 zmm31, zmmword ptr [INDEX1]
- vshufps zmm16, zmm8, zmm10, 136
- vshufps zmm17, zmm12, zmm14, 136
- vmovdqa32 zmm20, zmm16
- vpermt2d zmm16, zmm27, zmm17
- vpermt2d zmm20, zmm31, zmm17
- vshufps zmm17, zmm8, zmm10, 221
- vshufps zmm30, zmm12, zmm14, 221
- vmovdqa32 zmm21, zmm17
- vpermt2d zmm17, zmm27, zmm30
- vpermt2d zmm21, zmm31, zmm30
- vshufps zmm18, zmm9, zmm11, 136
- vshufps zmm8, zmm13, zmm15, 136
- vmovdqa32 zmm22, zmm18
- vpermt2d zmm18, zmm27, zmm8
- vpermt2d zmm22, zmm31, zmm8
- vshufps zmm19, zmm9, zmm11, 221
- vshufps zmm8, zmm13, zmm15, 221
- vmovdqa32 zmm23, zmm19
- vpermt2d zmm19, zmm27, zmm8
- vpermt2d zmm23, zmm31, zmm8
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- mov r12, qword ptr [rdi+40H]
- mov r13, qword ptr [rdi+48H]
- mov r14, qword ptr [rdi+50H]
- mov r15, qword ptr [rdi+58H]
- vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
- vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
- vpunpcklqdq zmm8, zmm24, zmm25
- vpunpckhqdq zmm9, zmm24, zmm25
- vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
- vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
- vpunpcklqdq zmm10, zmm24, zmm25
- vpunpckhqdq zmm11, zmm24, zmm25
- prefetcht0 byte ptr [r8+rdx+80H]
- prefetcht0 byte ptr [r12+rdx+80H]
- prefetcht0 byte ptr [r9+rdx+80H]
- prefetcht0 byte ptr [r13+rdx+80H]
- prefetcht0 byte ptr [r10+rdx+80H]
- prefetcht0 byte ptr [r14+rdx+80H]
- prefetcht0 byte ptr [r11+rdx+80H]
- prefetcht0 byte ptr [r15+rdx+80H]
- mov r8, qword ptr [rdi+20H]
- mov r9, qword ptr [rdi+28H]
- mov r10, qword ptr [rdi+30H]
- mov r11, qword ptr [rdi+38H]
- mov r12, qword ptr [rdi+60H]
- mov r13, qword ptr [rdi+68H]
- mov r14, qword ptr [rdi+70H]
- mov r15, qword ptr [rdi+78H]
- vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
- vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
- vpunpcklqdq zmm12, zmm24, zmm25
- vpunpckhqdq zmm13, zmm24, zmm25
- vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
- vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
- vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
- vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
- vpunpcklqdq zmm14, zmm24, zmm25
- vpunpckhqdq zmm15, zmm24, zmm25
- prefetcht0 byte ptr [r8+rdx+80H]
- prefetcht0 byte ptr [r12+rdx+80H]
- prefetcht0 byte ptr [r9+rdx+80H]
- prefetcht0 byte ptr [r13+rdx+80H]
- prefetcht0 byte ptr [r10+rdx+80H]
- prefetcht0 byte ptr [r14+rdx+80H]
- prefetcht0 byte ptr [r11+rdx+80H]
- prefetcht0 byte ptr [r15+rdx+80H]
- vshufps zmm24, zmm8, zmm10, 136
- vshufps zmm30, zmm12, zmm14, 136
- vmovdqa32 zmm28, zmm24
- vpermt2d zmm24, zmm27, zmm30
- vpermt2d zmm28, zmm31, zmm30
- vshufps zmm25, zmm8, zmm10, 221
- vshufps zmm30, zmm12, zmm14, 221
- vmovdqa32 zmm29, zmm25
- vpermt2d zmm25, zmm27, zmm30
- vpermt2d zmm29, zmm31, zmm30
- vshufps zmm26, zmm9, zmm11, 136
- vshufps zmm8, zmm13, zmm15, 136
- vmovdqa32 zmm30, zmm26
- vpermt2d zmm26, zmm27, zmm8
- vpermt2d zmm30, zmm31, zmm8
- vshufps zmm8, zmm9, zmm11, 221
- vshufps zmm10, zmm13, zmm15, 221
- vpermi2d zmm27, zmm8, zmm10
- vpermi2d zmm31, zmm8, zmm10
- vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0]
- vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1]
- vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2]
- vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3]
- vmovdqa32 zmm12, zmmword ptr [rsp]
- vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H]
- vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN]
- vpbroadcastd zmm15, dword ptr [rsp+22H*4H]
- vpaddd zmm0, zmm0, zmm16
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm20
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm17
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm21
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm24
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm28
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm25
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm29
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm18
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm23
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm22
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm16
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm17
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm25
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm27
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm30
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm19
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm29
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm20
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm18
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm22
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm27
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm21
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm31
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm26
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm30
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm23
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm19
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm20
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm21
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm16
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm24
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm28
- vpaddd zmm1, zmm1, zmm25
- vpaddd zmm2, zmm2, zmm31
- vpaddd zmm3, zmm3, zmm30
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm29
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm26
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm23
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm16
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm18
- vpaddd zmm1, zmm1, zmm19
- vpaddd zmm2, zmm2, zmm17
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm25
- vpaddd zmm1, zmm1, zmm27
- vpaddd zmm2, zmm2, zmm24
- vpaddd zmm3, zmm3, zmm31
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm30
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm28
- vpaddd zmm3, zmm3, zmm17
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm29
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm18
- vpaddd zmm3, zmm3, zmm20
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm19
- vpaddd zmm1, zmm1, zmm26
- vpaddd zmm2, zmm2, zmm22
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpaddd zmm0, zmm0, zmm27
- vpaddd zmm1, zmm1, zmm21
- vpaddd zmm2, zmm2, zmm17
- vpaddd zmm3, zmm3, zmm24
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vprord zmm15, zmm15, 16
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 12
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vpaddd zmm0, zmm0, zmm31
- vpaddd zmm1, zmm1, zmm16
- vpaddd zmm2, zmm2, zmm25
- vpaddd zmm3, zmm3, zmm22
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm1, zmm1, zmm5
- vpaddd zmm2, zmm2, zmm6
- vpaddd zmm3, zmm3, zmm7
- vpxord zmm12, zmm12, zmm0
- vpxord zmm13, zmm13, zmm1
- vpxord zmm14, zmm14, zmm2
- vpxord zmm15, zmm15, zmm3
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vprord zmm15, zmm15, 8
- vpaddd zmm8, zmm8, zmm12
- vpaddd zmm9, zmm9, zmm13
- vpaddd zmm10, zmm10, zmm14
- vpaddd zmm11, zmm11, zmm15
- vpxord zmm4, zmm4, zmm8
- vpxord zmm5, zmm5, zmm9
- vpxord zmm6, zmm6, zmm10
- vpxord zmm7, zmm7, zmm11
- vprord zmm4, zmm4, 7
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vpaddd zmm0, zmm0, zmm30
- vpaddd zmm1, zmm1, zmm18
- vpaddd zmm2, zmm2, zmm19
- vpaddd zmm3, zmm3, zmm23
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 16
- vprord zmm12, zmm12, 16
- vprord zmm13, zmm13, 16
- vprord zmm14, zmm14, 16
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 12
- vprord zmm6, zmm6, 12
- vprord zmm7, zmm7, 12
- vprord zmm4, zmm4, 12
- vpaddd zmm0, zmm0, zmm26
- vpaddd zmm1, zmm1, zmm28
- vpaddd zmm2, zmm2, zmm20
- vpaddd zmm3, zmm3, zmm29
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm1, zmm1, zmm6
- vpaddd zmm2, zmm2, zmm7
- vpaddd zmm3, zmm3, zmm4
- vpxord zmm15, zmm15, zmm0
- vpxord zmm12, zmm12, zmm1
- vpxord zmm13, zmm13, zmm2
- vpxord zmm14, zmm14, zmm3
- vprord zmm15, zmm15, 8
- vprord zmm12, zmm12, 8
- vprord zmm13, zmm13, 8
- vprord zmm14, zmm14, 8
- vpaddd zmm10, zmm10, zmm15
- vpaddd zmm11, zmm11, zmm12
- vpaddd zmm8, zmm8, zmm13
- vpaddd zmm9, zmm9, zmm14
- vpxord zmm5, zmm5, zmm10
- vpxord zmm6, zmm6, zmm11
- vpxord zmm7, zmm7, zmm8
- vpxord zmm4, zmm4, zmm9
- vprord zmm5, zmm5, 7
- vprord zmm6, zmm6, 7
- vprord zmm7, zmm7, 7
- vprord zmm4, zmm4, 7
- vpxord zmm0, zmm0, zmm8
- vpxord zmm1, zmm1, zmm9
- vpxord zmm2, zmm2, zmm10
- vpxord zmm3, zmm3, zmm11
- vpxord zmm4, zmm4, zmm12
- vpxord zmm5, zmm5, zmm13
- vpxord zmm6, zmm6, zmm14
- vpxord zmm7, zmm7, zmm15
- movzx eax, byte ptr [rbp+78H]
- jne innerloop16
- mov rbx, qword ptr [rbp+90H]
- vpunpckldq zmm16, zmm0, zmm1
- vpunpckhdq zmm17, zmm0, zmm1
- vpunpckldq zmm18, zmm2, zmm3
- vpunpckhdq zmm19, zmm2, zmm3
- vpunpckldq zmm20, zmm4, zmm5
- vpunpckhdq zmm21, zmm4, zmm5
- vpunpckldq zmm22, zmm6, zmm7
- vpunpckhdq zmm23, zmm6, zmm7
- vpunpcklqdq zmm0, zmm16, zmm18
- vpunpckhqdq zmm1, zmm16, zmm18
- vpunpcklqdq zmm2, zmm17, zmm19
- vpunpckhqdq zmm3, zmm17, zmm19
- vpunpcklqdq zmm4, zmm20, zmm22
- vpunpckhqdq zmm5, zmm20, zmm22
- vpunpcklqdq zmm6, zmm21, zmm23
- vpunpckhqdq zmm7, zmm21, zmm23
- vshufi32x4 zmm16, zmm0, zmm4, 88H
- vshufi32x4 zmm17, zmm1, zmm5, 88H
- vshufi32x4 zmm18, zmm2, zmm6, 88H
- vshufi32x4 zmm19, zmm3, zmm7, 88H
- vshufi32x4 zmm20, zmm0, zmm4, 0DDH
- vshufi32x4 zmm21, zmm1, zmm5, 0DDH
- vshufi32x4 zmm22, zmm2, zmm6, 0DDH
- vshufi32x4 zmm23, zmm3, zmm7, 0DDH
- vshufi32x4 zmm0, zmm16, zmm17, 88H
- vshufi32x4 zmm1, zmm18, zmm19, 88H
- vshufi32x4 zmm2, zmm20, zmm21, 88H
- vshufi32x4 zmm3, zmm22, zmm23, 88H
- vshufi32x4 zmm4, zmm16, zmm17, 0DDH
- vshufi32x4 zmm5, zmm18, zmm19, 0DDH
- vshufi32x4 zmm6, zmm20, zmm21, 0DDH
- vshufi32x4 zmm7, zmm22, zmm23, 0DDH
- vmovdqu32 zmmword ptr [rbx], zmm0
- vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1
- vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2
- vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3
- vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4
- vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5
- vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6
- vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7
- vmovdqa32 zmm0, zmmword ptr [rsp]
- vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H]
- vmovdqa32 zmm2, zmm0
- ; XXX: ml64.exe does not currently understand the syntax. We use a workaround.
- vpbroadcastd zmm4, dword ptr [ADD16]
- vpbroadcastd zmm5, dword ptr [ADD1]
- vpaddd zmm2{k1}, zmm0, zmm4
- ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16}
- vpcmpud k2, zmm2, zmm0, 1
- vpaddd zmm1 {k2}, zmm1, zmm5
- ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16}
- vmovdqa32 zmmword ptr [rsp], zmm2
- vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1
- add rdi, 128
- add rbx, 512
- mov qword ptr [rbp+90H], rbx
- sub rsi, 16
- cmp rsi, 16
- jnc outerloop16
- test rsi, rsi
- jne final15blocks
-unwind:
- vzeroupper
- vmovdqa xmm6, xmmword ptr [rsp+90H]
- vmovdqa xmm7, xmmword ptr [rsp+0A0H]
- vmovdqa xmm8, xmmword ptr [rsp+0B0H]
- vmovdqa xmm9, xmmword ptr [rsp+0C0H]
- vmovdqa xmm10, xmmword ptr [rsp+0D0H]
- vmovdqa xmm11, xmmword ptr [rsp+0E0H]
- vmovdqa xmm12, xmmword ptr [rsp+0F0H]
- vmovdqa xmm13, xmmword ptr [rsp+100H]
- vmovdqa xmm14, xmmword ptr [rsp+110H]
- vmovdqa xmm15, xmmword ptr [rsp+120H]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rsi
- pop rdi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-ALIGN 16
-final15blocks:
- test esi, 8H
- je final7blocks
- vpbroadcastd ymm0, dword ptr [rcx]
- vpbroadcastd ymm1, dword ptr [rcx+4H]
- vpbroadcastd ymm2, dword ptr [rcx+8H]
- vpbroadcastd ymm3, dword ptr [rcx+0CH]
- vpbroadcastd ymm4, dword ptr [rcx+10H]
- vpbroadcastd ymm5, dword ptr [rcx+14H]
- vpbroadcastd ymm6, dword ptr [rcx+18H]
- vpbroadcastd ymm7, dword ptr [rcx+1CH]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- mov r12, qword ptr [rdi+20H]
- mov r13, qword ptr [rdi+28H]
- mov r14, qword ptr [rdi+30H]
- mov r15, qword ptr [rdi+38H]
- movzx eax, byte ptr [rbp+78H]
- movzx ebx, byte ptr [rbp+80H]
- or eax, ebx
- xor edx, edx
-innerloop8:
- movzx ebx, byte ptr [rbp+88H]
- or ebx, eax
- add rdx, 64
- cmp rdx, qword ptr [rsp+80H]
- cmove eax, ebx
- mov dword ptr [rsp+88H], eax
- vmovups xmm8, xmmword ptr [r8+rdx-40H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-40H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-40H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-40H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm16, ymm12, ymm14, 136
- vshufps ymm17, ymm12, ymm14, 221
- vshufps ymm18, ymm13, ymm15, 136
- vshufps ymm19, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-30H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-30H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-30H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-30H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm20, ymm12, ymm14, 136
- vshufps ymm21, ymm12, ymm14, 221
- vshufps ymm22, ymm13, ymm15, 136
- vshufps ymm23, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-20H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-20H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-20H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-20H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm24, ymm12, ymm14, 136
- vshufps ymm25, ymm12, ymm14, 221
- vshufps ymm26, ymm13, ymm15, 136
- vshufps ymm27, ymm13, ymm15, 221
- vmovups xmm8, xmmword ptr [r8+rdx-10H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H
- vmovups xmm9, xmmword ptr [r9+rdx-10H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H
- vunpcklpd ymm12, ymm8, ymm9
- vunpckhpd ymm13, ymm8, ymm9
- vmovups xmm10, xmmword ptr [r10+rdx-10H]
- vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H
- vmovups xmm11, xmmword ptr [r11+rdx-10H]
- vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H
- vunpcklpd ymm14, ymm10, ymm11
- vunpckhpd ymm15, ymm10, ymm11
- vshufps ymm28, ymm12, ymm14, 136
- vshufps ymm29, ymm12, ymm14, 221
- vshufps ymm30, ymm13, ymm15, 136
- vshufps ymm31, ymm13, ymm15, 221
- vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0]
- vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1]
- vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2]
- vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3]
- vmovdqa ymm12, ymmword ptr [rsp]
- vmovdqa ymm13, ymmword ptr [rsp+40H]
- vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN]
- vpbroadcastd ymm15, dword ptr [rsp+88H]
- vpaddd ymm0, ymm0, ymm16
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm20
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm17
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm21
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm24
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm28
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm25
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm29
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm18
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm23
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm22
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm16
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm17
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm25
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm27
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm30
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm19
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm29
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm20
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm18
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm22
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm27
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm21
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm31
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm26
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm30
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm23
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm19
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm20
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm21
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm16
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm24
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm28
- vpaddd ymm1, ymm1, ymm25
- vpaddd ymm2, ymm2, ymm31
- vpaddd ymm3, ymm3, ymm30
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm29
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm26
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm23
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm16
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm18
- vpaddd ymm1, ymm1, ymm19
- vpaddd ymm2, ymm2, ymm17
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm25
- vpaddd ymm1, ymm1, ymm27
- vpaddd ymm2, ymm2, ymm24
- vpaddd ymm3, ymm3, ymm31
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm30
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm28
- vpaddd ymm3, ymm3, ymm17
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm29
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm18
- vpaddd ymm3, ymm3, ymm20
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm19
- vpaddd ymm1, ymm1, ymm26
- vpaddd ymm2, ymm2, ymm22
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpaddd ymm0, ymm0, ymm27
- vpaddd ymm1, ymm1, ymm21
- vpaddd ymm2, ymm2, ymm17
- vpaddd ymm3, ymm3, ymm24
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vprord ymm15, ymm15, 16
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 12
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vpaddd ymm0, ymm0, ymm31
- vpaddd ymm1, ymm1, ymm16
- vpaddd ymm2, ymm2, ymm25
- vpaddd ymm3, ymm3, ymm22
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm1, ymm1, ymm5
- vpaddd ymm2, ymm2, ymm6
- vpaddd ymm3, ymm3, ymm7
- vpxord ymm12, ymm12, ymm0
- vpxord ymm13, ymm13, ymm1
- vpxord ymm14, ymm14, ymm2
- vpxord ymm15, ymm15, ymm3
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vprord ymm15, ymm15, 8
- vpaddd ymm8, ymm8, ymm12
- vpaddd ymm9, ymm9, ymm13
- vpaddd ymm10, ymm10, ymm14
- vpaddd ymm11, ymm11, ymm15
- vpxord ymm4, ymm4, ymm8
- vpxord ymm5, ymm5, ymm9
- vpxord ymm6, ymm6, ymm10
- vpxord ymm7, ymm7, ymm11
- vprord ymm4, ymm4, 7
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vpaddd ymm0, ymm0, ymm30
- vpaddd ymm1, ymm1, ymm18
- vpaddd ymm2, ymm2, ymm19
- vpaddd ymm3, ymm3, ymm23
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 16
- vprord ymm12, ymm12, 16
- vprord ymm13, ymm13, 16
- vprord ymm14, ymm14, 16
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 12
- vprord ymm6, ymm6, 12
- vprord ymm7, ymm7, 12
- vprord ymm4, ymm4, 12
- vpaddd ymm0, ymm0, ymm26
- vpaddd ymm1, ymm1, ymm28
- vpaddd ymm2, ymm2, ymm20
- vpaddd ymm3, ymm3, ymm29
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm1, ymm1, ymm6
- vpaddd ymm2, ymm2, ymm7
- vpaddd ymm3, ymm3, ymm4
- vpxord ymm15, ymm15, ymm0
- vpxord ymm12, ymm12, ymm1
- vpxord ymm13, ymm13, ymm2
- vpxord ymm14, ymm14, ymm3
- vprord ymm15, ymm15, 8
- vprord ymm12, ymm12, 8
- vprord ymm13, ymm13, 8
- vprord ymm14, ymm14, 8
- vpaddd ymm10, ymm10, ymm15
- vpaddd ymm11, ymm11, ymm12
- vpaddd ymm8, ymm8, ymm13
- vpaddd ymm9, ymm9, ymm14
- vpxord ymm5, ymm5, ymm10
- vpxord ymm6, ymm6, ymm11
- vpxord ymm7, ymm7, ymm8
- vpxord ymm4, ymm4, ymm9
- vprord ymm5, ymm5, 7
- vprord ymm6, ymm6, 7
- vprord ymm7, ymm7, 7
- vprord ymm4, ymm4, 7
- vpxor ymm0, ymm0, ymm8
- vpxor ymm1, ymm1, ymm9
- vpxor ymm2, ymm2, ymm10
- vpxor ymm3, ymm3, ymm11
- vpxor ymm4, ymm4, ymm12
- vpxor ymm5, ymm5, ymm13
- vpxor ymm6, ymm6, ymm14
- vpxor ymm7, ymm7, ymm15
- movzx eax, byte ptr [rbp+78H]
- jne innerloop8
- mov rbx, qword ptr [rbp+90H]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
- vshufps ymm12, ymm8, ymm9, 78
- vblendps ymm1, ymm8, ymm12, 0CCH
- vshufps ymm8, ymm11, ymm0, 78
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0CCH
- vblendps ymm3, ymm12, ymm9, 0CCH
- vperm2f128 ymm12, ymm1, ymm2, 20H
- vmovups ymmword ptr [rbx], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0CCH
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 20H
- vmovups ymmword ptr [rbx+20H], ymm7
- vshufps ymm5, ymm10, ymm13, 78
- vblendps ymm6, ymm5, ymm13, 0CCH
- vshufps ymm13, ymm14, ymm15, 78
- vblendps ymm10, ymm10, ymm5, 0CCH
- vblendps ymm14, ymm14, ymm13, 0CCH
- vperm2f128 ymm8, ymm10, ymm14, 20H
- vmovups ymmword ptr [rbx+40H], ymm8
- vblendps ymm15, ymm13, ymm15, 0CCH
- vperm2f128 ymm13, ymm6, ymm15, 20H
- vmovups ymmword ptr [rbx+60H], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 31H
- vperm2f128 ymm11, ymm3, ymm4, 31H
- vmovups ymmword ptr [rbx+80H], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 31H
- vperm2f128 ymm15, ymm6, ymm15, 31H
- vmovups ymmword ptr [rbx+0A0H], ymm11
- vmovups ymmword ptr [rbx+0C0H], ymm14
- vmovups ymmword ptr [rbx+0E0H], ymm15
- vmovdqa ymm0, ymmword ptr [rsp]
- vmovdqa ymm2, ymmword ptr [rsp+40H]
- vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H]
- vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H]
- vmovdqa ymmword ptr [rsp], ymm0
- vmovdqa ymmword ptr [rsp+40H], ymm2
- add rbx, 256
- mov qword ptr [rbp+90H], rbx
- add rdi, 64
- sub rsi, 8
-final7blocks:
- mov rbx, qword ptr [rbp+90H]
- mov r15, qword ptr [rsp+80H]
- movzx r13, byte ptr [rbp+78H]
- movzx r12, byte ptr [rbp+88H]
- test esi, 4H
- je final3blocks
- vbroadcasti32x4 zmm0, xmmword ptr [rcx]
- vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H]
- vmovdqa xmm12, xmmword ptr [rsp]
- vmovdqa xmm13, xmmword ptr [rsp+40H]
- vpunpckldq xmm14, xmm12, xmm13
- vpunpckhdq xmm15, xmm12, xmm13
- vpermq ymm14, ymm14, 0DCH
- vpermq ymm15, ymm15, 0DCH
- vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
- vinserti64x4 zmm13, zmm14, ymm15, 01H
- mov eax, 17476
- kmovw k2, eax
- vpblendmd zmm13 {k2}, zmm13, zmm12
- vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV]
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- mov eax, 43690
- kmovw k3, eax
- mov eax, 34952
- kmovw k4, eax
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-ALIGN 16
-innerloop4:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+88H], eax
- vmovdqa32 zmm2, zmm15
- vpbroadcastd zmm8, dword ptr [rsp+22H*4H]
- vpblendmd zmm3 {k4}, zmm13, zmm8
- vmovups zmm8, zmmword ptr [r8+rdx-1H*40H]
- vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H
- vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H
- vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H
- vmovups zmm9, zmmword ptr [r8+rdx-30H]
- vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H
- vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H
- vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H
- vshufps zmm4, zmm8, zmm9, 136
- vshufps zmm5, zmm8, zmm9, 221
- vmovups zmm8, zmmword ptr [r8+rdx-20H]
- vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H
- vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H
- vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H
- vmovups zmm9, zmmword ptr [r8+rdx-10H]
- vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H
- vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H
- vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H
- vshufps zmm6, zmm8, zmm9, 136
- vshufps zmm7, zmm8, zmm9, 221
- vpshufd zmm6, zmm6, 93H
- vpshufd zmm7, zmm7, 93H
- mov al, 7
-roundloop4:
- vpaddd zmm0, zmm0, zmm4
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 16
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 12
- vpaddd zmm0, zmm0, zmm5
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 8
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 7
- vpshufd zmm0, zmm0, 93H
- vpshufd zmm3, zmm3, 4EH
- vpshufd zmm2, zmm2, 39H
- vpaddd zmm0, zmm0, zmm6
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 16
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 12
- vpaddd zmm0, zmm0, zmm7
- vpaddd zmm0, zmm0, zmm1
- vpxord zmm3, zmm3, zmm0
- vprord zmm3, zmm3, 8
- vpaddd zmm2, zmm2, zmm3
- vpxord zmm1, zmm1, zmm2
- vprord zmm1, zmm1, 7
- vpshufd zmm0, zmm0, 39H
- vpshufd zmm3, zmm3, 4EH
- vpshufd zmm2, zmm2, 93H
- dec al
- jz endroundloop4
- vshufps zmm8, zmm4, zmm5, 214
- vpshufd zmm9, zmm4, 0FH
- vpshufd zmm4, zmm8, 39H
- vshufps zmm8, zmm6, zmm7, 250
- vpblendmd zmm9 {k3}, zmm9, zmm8
- vpunpcklqdq zmm8, zmm7, zmm5
- vpblendmd zmm8 {k4}, zmm8, zmm6
- vpshufd zmm8, zmm8, 78H
- vpunpckhdq zmm5, zmm5, zmm7
- vpunpckldq zmm6, zmm6, zmm5
- vpshufd zmm7, zmm6, 1EH
- vmovdqa32 zmm5, zmm9
- vmovdqa32 zmm6, zmm8
- jmp roundloop4
-endroundloop4:
- vpxord zmm0, zmm0, zmm2
- vpxord zmm1, zmm1, zmm3
- mov eax, r13d
- cmp rdx, r15
- jne innerloop4
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+10H], xmm1
- vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
- vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
- vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H
- vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H
- vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H
- vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H
- vmovdqa xmm0, xmmword ptr [rsp]
- vmovdqa xmm2, xmmword ptr [rsp+40H]
- vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H]
- vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H]
- vmovdqa xmmword ptr [rsp], xmm0
- vmovdqa xmmword ptr [rsp+40H], xmm2
- add rbx, 128
- add rdi, 32
- sub rsi, 4
-final3blocks:
- test esi, 2H
- je final1block
- vbroadcasti128 ymm0, xmmword ptr [rcx]
- vbroadcasti128 ymm1, xmmword ptr [rcx+10H]
- vmovd xmm13, dword ptr [rsp]
- vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1
- vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
- vmovd xmm14, dword ptr [rsp+4H]
- vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
- vinserti128 ymm13, ymm13, xmm14, 01H
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-ALIGN 16
-innerloop2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- mov dword ptr [rsp+88H], eax
- vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV]
- vpbroadcastd ymm8, dword ptr [rsp+88H]
- vpblendd ymm3, ymm13, ymm8, 88H
- vmovups ymm8, ymmword ptr [r8+rdx-40H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H
- vmovups ymm9, ymmword ptr [r8+rdx-30H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H
- vshufps ymm4, ymm8, ymm9, 136
- vshufps ymm5, ymm8, ymm9, 221
- vmovups ymm8, ymmword ptr [r8+rdx-20H]
- vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H
- vmovups ymm9, ymmword ptr [r8+rdx-10H]
- vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H
- vshufps ymm6, ymm8, ymm9, 136
- vshufps ymm7, ymm8, ymm9, 221
- vpshufd ymm6, ymm6, 93H
- vpshufd ymm7, ymm7, 93H
- mov al, 7
-roundloop2:
- vpaddd ymm0, ymm0, ymm4
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 16
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 12
- vpaddd ymm0, ymm0, ymm5
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 8
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 7
- vpshufd ymm0, ymm0, 93H
- vpshufd ymm3, ymm3, 4EH
- vpshufd ymm2, ymm2, 39H
- vpaddd ymm0, ymm0, ymm6
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 16
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 12
- vpaddd ymm0, ymm0, ymm7
- vpaddd ymm0, ymm0, ymm1
- vpxord ymm3, ymm3, ymm0
- vprord ymm3, ymm3, 8
- vpaddd ymm2, ymm2, ymm3
- vpxord ymm1, ymm1, ymm2
- vprord ymm1, ymm1, 7
- vpshufd ymm0, ymm0, 39H
- vpshufd ymm3, ymm3, 4EH
- vpshufd ymm2, ymm2, 93H
- dec al
- jz endroundloop2
- vshufps ymm8, ymm4, ymm5, 214
- vpshufd ymm9, ymm4, 0FH
- vpshufd ymm4, ymm8, 39H
- vshufps ymm8, ymm6, ymm7, 250
- vpblendd ymm9, ymm9, ymm8, 0AAH
- vpunpcklqdq ymm8, ymm7, ymm5
- vpblendd ymm8, ymm8, ymm6, 88H
- vpshufd ymm8, ymm8, 78H
- vpunpckhdq ymm5, ymm5, ymm7
- vpunpckldq ymm6, ymm6, ymm5
- vpshufd ymm7, ymm6, 1EH
- vmovdqa ymm5, ymm9
- vmovdqa ymm6, ymm8
- jmp roundloop2
-endroundloop2:
- vpxor ymm0, ymm0, ymm2
- vpxor ymm1, ymm1, ymm3
- mov eax, r13d
- cmp rdx, r15
- jne innerloop2
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+10H], xmm1
- vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
- vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
- vmovdqa xmm0, xmmword ptr [rsp]
- vmovdqa xmm2, xmmword ptr [rsp+40H]
- vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H]
- vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H]
- vmovdqa xmmword ptr [rsp], xmm0
- vmovdqa xmmword ptr [rsp+40H], xmm2
- add rbx, 64
- add rdi, 16
- sub rsi, 2
-final1block:
- test esi, 1H
- je unwind
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+10H]
- vmovd xmm14, dword ptr [rsp]
- vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1
- vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
- vmovdqa xmm15, xmmword ptr [BLAKE3_IV]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-ALIGN 16
-innerloop1:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- vpinsrd xmm3, xmm14, eax, 3
- vmovdqa xmm2, xmm15
- vmovups xmm8, xmmword ptr [r8+rdx-40H]
- vmovups xmm9, xmmword ptr [r8+rdx-30H]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [r8+rdx-20H]
- vmovups xmm9, xmmword ptr [r8+rdx-10H]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 93H
- vpshufd xmm7, xmm7, 93H
- mov al, 7
-roundloop1:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 93H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 39H
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 39H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 93H
- dec al
- jz endroundloop1
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0FH
- vpshufd xmm4, xmm8, 39H
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0AAH
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 88H
- vpshufd xmm8, xmm8, 78H
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 1EH
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp roundloop1
-endroundloop1:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne innerloop1
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+10H], xmm1
- jmp unwind
-
-_blake3_hash_many_avx512 ENDP
-blake3_hash_many_avx512 ENDP
-
-ALIGN 16
-blake3_compress_in_place_avx512 PROC
-_blake3_compress_in_place_avx512 PROC
- sub rsp, 72
- vmovdqa xmmword ptr [rsp], xmm6
- vmovdqa xmmword ptr [rsp+10H], xmm7
- vmovdqa xmmword ptr [rsp+20H], xmm8
- vmovdqa xmmword ptr [rsp+30H], xmm9
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+10H]
- movzx eax, byte ptr [rsp+70H]
- movzx r8d, r8b
- shl rax, 32
- add r8, rax
- vmovq xmm3, r9
- vmovq xmm4, r8
- vpunpcklqdq xmm3, xmm3, xmm4
- vmovaps xmm2, xmmword ptr [BLAKE3_IV]
- vmovups xmm8, xmmword ptr [rdx]
- vmovups xmm9, xmmword ptr [rdx+10H]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [rdx+20H]
- vmovups xmm9, xmmword ptr [rdx+30H]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 93H
- vpshufd xmm7, xmm7, 93H
- mov al, 7
-@@:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 93H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 39H
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 39H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 93H
- dec al
- jz @F
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0FH
- vpshufd xmm4, xmm8, 39H
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0AAH
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 88H
- vpshufd xmm8, xmm8, 78H
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 1EH
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp @B
-@@:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- vmovdqu xmmword ptr [rcx], xmm0
- vmovdqu xmmword ptr [rcx+10H], xmm1
- vmovdqa xmm6, xmmword ptr [rsp]
- vmovdqa xmm7, xmmword ptr [rsp+10H]
- vmovdqa xmm8, xmmword ptr [rsp+20H]
- vmovdqa xmm9, xmmword ptr [rsp+30H]
- add rsp, 72
- ret
-_blake3_compress_in_place_avx512 ENDP
-blake3_compress_in_place_avx512 ENDP
-
-ALIGN 16
-blake3_compress_xof_avx512 PROC
-_blake3_compress_xof_avx512 PROC
- sub rsp, 72
- vmovdqa xmmword ptr [rsp], xmm6
- vmovdqa xmmword ptr [rsp+10H], xmm7
- vmovdqa xmmword ptr [rsp+20H], xmm8
- vmovdqa xmmword ptr [rsp+30H], xmm9
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+10H]
- movzx eax, byte ptr [rsp+70H]
- movzx r8d, r8b
- mov r10, qword ptr [rsp+78H]
- shl rax, 32
- add r8, rax
- vmovq xmm3, r9
- vmovq xmm4, r8
- vpunpcklqdq xmm3, xmm3, xmm4
- vmovaps xmm2, xmmword ptr [BLAKE3_IV]
- vmovups xmm8, xmmword ptr [rdx]
- vmovups xmm9, xmmword ptr [rdx+10H]
- vshufps xmm4, xmm8, xmm9, 136
- vshufps xmm5, xmm8, xmm9, 221
- vmovups xmm8, xmmword ptr [rdx+20H]
- vmovups xmm9, xmmword ptr [rdx+30H]
- vshufps xmm6, xmm8, xmm9, 136
- vshufps xmm7, xmm8, xmm9, 221
- vpshufd xmm6, xmm6, 93H
- vpshufd xmm7, xmm7, 93H
- mov al, 7
-@@:
- vpaddd xmm0, xmm0, xmm4
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm5
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 93H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 39H
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 16
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 12
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7
- vpshufd xmm0, xmm0, 39H
- vpshufd xmm3, xmm3, 4EH
- vpshufd xmm2, xmm2, 93H
- dec al
- jz @F
- vshufps xmm8, xmm4, xmm5, 214
- vpshufd xmm9, xmm4, 0FH
- vpshufd xmm4, xmm8, 39H
- vshufps xmm8, xmm6, xmm7, 250
- vpblendd xmm9, xmm9, xmm8, 0AAH
- vpunpcklqdq xmm8, xmm7, xmm5
- vpblendd xmm8, xmm8, xmm6, 88H
- vpshufd xmm8, xmm8, 78H
- vpunpckhdq xmm5, xmm5, xmm7
- vpunpckldq xmm6, xmm6, xmm5
- vpshufd xmm7, xmm6, 1EH
- vmovdqa xmm5, xmm9
- vmovdqa xmm6, xmm8
- jmp @B
-@@:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- vpxor xmm2, xmm2, xmmword ptr [rcx]
- vpxor xmm3, xmm3, xmmword ptr [rcx+10H]
- vmovdqu xmmword ptr [r10], xmm0
- vmovdqu xmmword ptr [r10+10H], xmm1
- vmovdqu xmmword ptr [r10+20H], xmm2
- vmovdqu xmmword ptr [r10+30H], xmm3
- vmovdqa xmm6, xmmword ptr [rsp]
- vmovdqa xmm7, xmmword ptr [rsp+10H]
- vmovdqa xmm8, xmmword ptr [rsp+20H]
- vmovdqa xmm9, xmmword ptr [rsp+30H]
- add rsp, 72
- ret
-_blake3_compress_xof_avx512 ENDP
-blake3_compress_xof_avx512 ENDP
-
-_TEXT ENDS
-
-_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
-ALIGN 64
-INDEX0:
- dd 0, 1, 2, 3, 16, 17, 18, 19
- dd 8, 9, 10, 11, 24, 25, 26, 27
-INDEX1:
- dd 4, 5, 6, 7, 20, 21, 22, 23
- dd 12, 13, 14, 15, 28, 29, 30, 31
-ADD0:
- dd 0, 1, 2, 3, 4, 5, 6, 7
- dd 8, 9, 10, 11, 12, 13, 14, 15
-ADD1:
- dd 1
-ADD16:
- dd 16
-BLAKE3_BLOCK_LEN:
- dd 64
-ALIGN 64
-BLAKE3_IV:
-BLAKE3_IV_0:
- dd 06A09E667H
-BLAKE3_IV_1:
- dd 0BB67AE85H
-BLAKE3_IV_2:
- dd 03C6EF372H
-BLAKE3_IV_3:
- dd 0A54FF53AH
-
-_RDATA ENDS
-END
diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml
deleted file mode 100644
index 2052c7458..000000000
--- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml
+++ /dev/null
@@ -1,29 +0,0 @@
-# These are Rust bindings for the C implementation of BLAKE3. As there is a
-# native (and faster) Rust implementation of BLAKE3 provided in this same repo,
-# these bindings are not expected to be used in production. They're intended
-# for testing and benchmarking.
-
-[package]
-name = "blake3_c_rust_bindings"
-version = "0.0.0"
-description = "TESTING ONLY Rust bindings for the BLAKE3 C implementation"
-edition = "2018"
-
-[features]
-# By default the x86-64 build uses assembly implementations. This feature makes
-# the build use the C intrinsics implementations instead.
-prefer_intrinsics = []
-# Activate NEON bindings. We don't currently do any CPU feature detection for
-# this. If this Cargo feature is on, the NEON gets used.
-neon = []
-
-[dev-dependencies]
-arrayref = "0.3.5"
-arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] }
-page_size = "0.4.1"
-rand = "0.7.2"
-rand_chacha = "0.2.1"
-reference_impl = { path = "../../reference_impl" }
-
-[build-dependencies]
-cc = "1.0.48"
diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/README.md b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/README.md
deleted file mode 100644
index c44726b90..000000000
--- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-These are Rust bindings for the C implementation of BLAKE3. As there is
-a native Rust implementation of BLAKE3 provided in this same repo, these
-bindings are not expected to be used in production. They're intended for
-testing and benchmarking.
diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs
deleted file mode 100644
index 119bd2064..000000000
--- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs
+++ /dev/null
@@ -1,393 +0,0 @@
-#![feature(test)]
-
-extern crate test;
-
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
-use rand::prelude::*;
-use test::Bencher;
-
-const KIB: usize = 1024;
-const MAX_SIMD_DEGREE: usize = 16;
-
-const BLOCK_LEN: usize = 64;
-const CHUNK_LEN: usize = 1024;
-const OUT_LEN: usize = 32;
-
-// This struct randomizes two things:
-// 1. The actual bytes of input.
-// 2. The page offset the input starts at.
-pub struct RandomInput {
- buf: Vec<u8>,
- len: usize,
- offsets: Vec<usize>,
- offset_index: usize,
-}
-
-impl RandomInput {
- pub fn new(b: &mut Bencher, len: usize) -> Self {
- b.bytes += len as u64;
- let page_size: usize = page_size::get();
- let mut buf = vec![0u8; len + page_size];
- let mut rng = rand::thread_rng();
- rng.fill_bytes(&mut buf);
- let mut offsets: Vec<usize> = (0..page_size).collect();
- offsets.shuffle(&mut rng);
- Self {
- buf,
- len,
- offsets,
- offset_index: 0,
- }
- }
-
- pub fn get(&mut self) -> &[u8] {
- let offset = self.offsets[self.offset_index];
- self.offset_index += 1;
- if self.offset_index >= self.offsets.len() {
- self.offset_index = 0;
- }
- &self.buf[offset..][..self.len]
- }
-}
-
-type CompressInPlaceFn =
- unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8);
-
-fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) {
- let mut state = [1u32; 8];
- let mut r = RandomInput::new(b, 64);
- let input = array_ref!(r.get(), 0, 64);
- b.iter(|| unsafe { f(state.as_mut_ptr(), input.as_ptr(), 64, 0, 0) });
-}
-
-#[bench]
-fn bench_single_compression_portable(b: &mut Bencher) {
- bench_single_compression_fn(
- b,
- blake3_c_rust_bindings::ffi::blake3_compress_in_place_portable,
- );
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse2(b: &mut Bencher) {
- if !blake3_c_rust_bindings::sse2_detected() {
- return;
- }
- bench_single_compression_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse2,
- );
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse41(b: &mut Bencher) {
- if !blake3_c_rust_bindings::sse41_detected() {
- return;
- }
- bench_single_compression_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse41,
- );
-}
-
-#[bench]
-fn bench_single_compression_avx512(b: &mut Bencher) {
- if !blake3_c_rust_bindings::avx512_detected() {
- return;
- }
- bench_single_compression_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_avx512,
- );
-}
-
-type HashManyFn = unsafe extern "C" fn(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
-);
-
-fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn, degree: usize) {
- let mut inputs = Vec::new();
- for _ in 0..degree {
- inputs.push(RandomInput::new(b, CHUNK_LEN));
- }
- b.iter(|| {
- let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- unsafe {
- f(
- input_arrays.as_ptr() as _,
- input_arrays.len(),
- CHUNK_LEN / BLOCK_LEN,
- [0u32; 8].as_ptr(),
- 0,
- true,
- 0,
- 0,
- 0,
- out.as_mut_ptr(),
- )
- }
- });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse2(b: &mut Bencher) {
- if !blake3_c_rust_bindings::sse2_detected() {
- return;
- }
- bench_many_chunks_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2,
- 4,
- );
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse41(b: &mut Bencher) {
- if !blake3_c_rust_bindings::sse41_detected() {
- return;
- }
- bench_many_chunks_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41,
- 4,
- );
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_avx2(b: &mut Bencher) {
- if !blake3_c_rust_bindings::avx2_detected() {
- return;
- }
- bench_many_chunks_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2,
- 8,
- );
-}
-
-#[bench]
-fn bench_many_chunks_avx512(b: &mut Bencher) {
- if !blake3_c_rust_bindings::avx512_detected() {
- return;
- }
- bench_many_chunks_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512,
- 16,
- );
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_chunks_neon(b: &mut Bencher) {
- // When "neon" is on, NEON support is assumed.
- bench_many_chunks_fn(
- b,
- blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon,
- 4,
- );
-}
-
-// TODO: When we get const generics we can unify this with the chunks code.
-fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn, degree: usize) {
- let mut inputs = Vec::new();
- for _ in 0..degree {
- inputs.push(RandomInput::new(b, BLOCK_LEN));
- }
- b.iter(|| {
- let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- unsafe {
- f(
- input_arrays.as_ptr() as _,
- input_arrays.len(),
- 1,
- [0u32; 8].as_ptr(),
- 0,
- false,
- 0,
- 0,
- 0,
- out.as_mut_ptr(),
- )
- }
- });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse2(b: &mut Bencher) {
- if !blake3_c_rust_bindings::sse2_detected() {
- return;
- }
- bench_many_parents_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2,
- 4,
- );
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse41(b: &mut Bencher) {
- if !blake3_c_rust_bindings::sse41_detected() {
- return;
- }
- bench_many_parents_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41,
- 4,
- );
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_avx2(b: &mut Bencher) {
- if !blake3_c_rust_bindings::avx2_detected() {
- return;
- }
- bench_many_parents_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2,
- 8,
- );
-}
-
-#[bench]
-fn bench_many_parents_avx512(b: &mut Bencher) {
- if !blake3_c_rust_bindings::avx512_detected() {
- return;
- }
- bench_many_parents_fn(
- b,
- blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512,
- 16,
- );
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_parents_neon(b: &mut Bencher) {
- // When "neon" is on, NEON support is assumed.
- bench_many_parents_fn(
- b,
- blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon,
- 4,
- );
-}
-
-fn bench_incremental(b: &mut Bencher, len: usize) {
- let mut input = RandomInput::new(b, len);
- b.iter(|| {
- let mut hasher = blake3_c_rust_bindings::Hasher::new();
- hasher.update(input.get());
- let mut out = [0; 32];
- hasher.finalize(&mut out);
- out
- });
-}
-
-#[bench]
-fn bench_incremental_0001_block(b: &mut Bencher) {
- bench_incremental(b, BLOCK_LEN);
-}
-
-#[bench]
-fn bench_incremental_0001_kib(b: &mut Bencher) {
- bench_incremental(b, 1 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0002_kib(b: &mut Bencher) {
- bench_incremental(b, 2 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0004_kib(b: &mut Bencher) {
- bench_incremental(b, 4 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0008_kib(b: &mut Bencher) {
- bench_incremental(b, 8 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0016_kib(b: &mut Bencher) {
- bench_incremental(b, 16 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0032_kib(b: &mut Bencher) {
- bench_incremental(b, 32 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0064_kib(b: &mut Bencher) {
- bench_incremental(b, 64 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0128_kib(b: &mut Bencher) {
- bench_incremental(b, 128 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0256_kib(b: &mut Bencher) {
- bench_incremental(b, 256 * KIB);
-}
-
-#[bench]
-fn bench_incremental_0512_kib(b: &mut Bencher) {
- bench_incremental(b, 512 * KIB);
-}
-
-#[bench]
-fn bench_incremental_1024_kib(b: &mut Bencher) {
- bench_incremental(b, 1024 * KIB);
-}
-
-// This checks that update() splits up its input in increasing powers of 2, so
-// that it can recover a high degree of parallelism when the number of bytes
-// hashed so far is uneven. The performance of this benchmark should be
-// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
-// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
-// performance was less than half.
-#[bench]
-fn bench_two_updates(b: &mut Bencher) {
- let len = 65536;
- let mut input = RandomInput::new(b, len);
- b.iter(|| {
- let mut hasher = blake3_c_rust_bindings::Hasher::new();
- let input = input.get();
- hasher.update(&input[..1]);
- hasher.update(&input[1..]);
- let mut out = [0; 32];
- hasher.finalize(&mut out);
- out
- });
-}
diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs
deleted file mode 100644
index d5dc47a81..000000000
--- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs
+++ /dev/null
@@ -1,182 +0,0 @@
-use std::env;
-
-fn defined(var: &str) -> bool {
- env::var_os(var).is_some()
-}
-
-fn target_components() -> Vec<String> {
- let target = env::var("TARGET").unwrap();
- target.split("-").map(|s| s.to_string()).collect()
-}
-
-fn is_x86_64() -> bool {
- target_components()[0] == "x86_64"
-}
-
-fn is_x86_32() -> bool {
- let arch = &target_components()[0];
- arch == "i386" || arch == "i586" || arch == "i686"
-}
-
-fn is_armv7() -> bool {
- target_components()[0] == "armv7"
-}
-
-// Windows targets may be using the MSVC toolchain or the GNU toolchain. The
-// right compiler flags to use depend on the toolchain. (And we don't want to
-// use flag_if_supported, because we don't want features to be silently
-// disabled by old compilers.)
-fn is_windows_msvc() -> bool {
- // Some targets are only two components long, so check in steps.
- target_components()[1] == "pc"
- && target_components()[2] == "windows"
- && target_components()[3] == "msvc"
-}
-
-fn is_windows_gnu() -> bool {
- // Some targets are only two components long, so check in steps.
- target_components()[1] == "pc"
- && target_components()[2] == "windows"
- && target_components()[3] == "gnu"
-}
-
-fn new_build() -> cc::Build {
- let mut build = cc::Build::new();
- if !is_windows_msvc() {
- build.flag("-std=c11");
- }
- build
-}
-
-fn c_dir_path(filename: &str) -> String {
- // The `cross` tool doesn't support reading files in parent directories. As a hacky workaround
- // in `cross_test.sh`, we move the c/ directory around and set BLAKE3_C_DIR_OVERRIDE. Regular
- // building and testing doesn't require this.
- if let Ok(c_dir_override) = env::var("BLAKE3_C_DIR_OVERRIDE") {
- c_dir_override + "/" + filename
- } else {
- "../".to_string() + filename
- }
-}
-
-fn main() -> Result<(), Box<dyn std::error::Error>> {
- let mut base_build = new_build();
- base_build.file(c_dir_path("blake3.c"));
- base_build.file(c_dir_path("blake3_dispatch.c"));
- base_build.file(c_dir_path("blake3_portable.c"));
- base_build.compile("blake3_base");
-
- if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") {
- // On 64-bit, use the assembly implementations, unless the
- // "prefer_intrinsics" feature is enabled.
- if is_windows_msvc() {
- let mut build = new_build();
- build.file(c_dir_path("blake3_sse2_x86-64_windows_msvc.asm"));
- build.file(c_dir_path("blake3_sse41_x86-64_windows_msvc.asm"));
- build.file(c_dir_path("blake3_avx2_x86-64_windows_msvc.asm"));
- build.file(c_dir_path("blake3_avx512_x86-64_windows_msvc.asm"));
- build.compile("blake3_asm");
- } else if is_windows_gnu() {
- let mut build = new_build();
- build.file(c_dir_path("blake3_sse2_x86-64_windows_gnu.S"));
- build.file(c_dir_path("blake3_sse41_x86-64_windows_gnu.S"));
- build.file(c_dir_path("blake3_avx2_x86-64_windows_gnu.S"));
- build.file(c_dir_path("blake3_avx512_x86-64_windows_gnu.S"));
- build.compile("blake3_asm");
- } else {
- // All non-Windows implementations are assumed to support
- // Linux-style assembly. These files do contain a small
- // explicit workaround for macOS also.
- let mut build = new_build();
- build.file(c_dir_path("blake3_sse2_x86-64_unix.S"));
- build.file(c_dir_path("blake3_sse41_x86-64_unix.S"));
- build.file(c_dir_path("blake3_avx2_x86-64_unix.S"));
- build.file(c_dir_path("blake3_avx512_x86-64_unix.S"));
- build.compile("blake3_asm");
- }
- } else if is_x86_64() || is_x86_32() {
- // Assembly implementations are only for 64-bit. On 32-bit, or if
- // the "prefer_intrinsics" feature is enabled, use the
- // intrinsics-based C implementations. These each need to be
- // compiled separately, with the corresponding instruction set
- // extension explicitly enabled in the compiler.
-
- let mut sse2_build = new_build();
- sse2_build.file(c_dir_path("blake3_sse2.c"));
- if is_windows_msvc() {
- // /arch:SSE2 is the default on x86 and undefined on x86_64:
- // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
- // It also includes SSE4.1 intrisincs:
- // https://stackoverflow.com/a/32183222/823869
- } else {
- sse2_build.flag("-msse2");
- }
- sse2_build.compile("blake3_sse2");
-
- let mut sse41_build = new_build();
- sse41_build.file(c_dir_path("blake3_sse41.c"));
- if is_windows_msvc() {
- // /arch:SSE2 is the default on x86 and undefined on x86_64:
- // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
- // It also includes SSE4.1 intrisincs:
- // https://stackoverflow.com/a/32183222/823869
- } else {
- sse41_build.flag("-msse4.1");
- }
- sse41_build.compile("blake3_sse41");
-
- let mut avx2_build = new_build();
- avx2_build.file(c_dir_path("blake3_avx2.c"));
- if is_windows_msvc() {
- avx2_build.flag("/arch:AVX2");
- } else {
- avx2_build.flag("-mavx2");
- }
- avx2_build.compile("blake3_avx2");
-
- let mut avx512_build = new_build();
- avx512_build.file(c_dir_path("blake3_avx512.c"));
- if is_windows_msvc() {
- // Note that a lot of versions of MSVC don't support /arch:AVX512,
- // and they'll discard it with a warning, hopefully leading to a
- // build error.
- avx512_build.flag("/arch:AVX512");
- } else {
- avx512_build.flag("-mavx512f");
- avx512_build.flag("-mavx512vl");
- }
- avx512_build.compile("blake3_avx512");
- }
-
- // We only build NEON code here if 1) it's requested and 2) the root crate
- // is not already building it. The only time this will really happen is if
- // you build this crate by hand with the "neon" feature for some reason.
- if defined("CARGO_FEATURE_NEON") {
- let mut neon_build = new_build();
- neon_build.file(c_dir_path("blake3_neon.c"));
- // ARMv7 platforms that support NEON generally need the following
- // flags. AArch64 supports NEON by default and does not support -mpfu.
- if is_armv7() {
- neon_build.flag("-mfpu=neon-vfpv4");
- neon_build.flag("-mfloat-abi=hard");
- }
- neon_build.compile("blake3_neon");
- }
-
- // The `cc` crate does not automatically emit rerun-if directives for the
- // environment variables it supports, in particular for $CC. We expect to
- // do a lot of benchmarking across different compilers, so we explicitly
- // add the variables that we're likely to need.
- println!("cargo:rerun-if-env-changed=CC");
- println!("cargo:rerun-if-env-changed=CFLAGS");
-
- // Ditto for source files, though these shouldn't change as often.
- for file in std::fs::read_dir("..")? {
- println!(
- "cargo:rerun-if-changed={}",
- file?.path().to_str().expect("utf-8")
- );
- }
-
- Ok(())
-}
diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh
deleted file mode 100644
index 94d50affb..000000000
--- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#! /usr/bin/env bash
-
-# This hacky script works around the fact that `cross test` does not support
-# path dependencies. (It uses a docker shared folder to let the guest access
-# project files, so parent directories aren't available.) Solve this problem by
-# copying the entire project to a temp dir and rearranging paths to put "c" and
-# "reference_impl" underneath "blake3_c_rust_bindings", so that everything is
-# accessible. Hopefully this will just run on CI forever and no one will ever
-# read this and discover my deep shame.
-
-set -e -u -o pipefail
-
-project_root="$(realpath "$(dirname "$BASH_SOURCE")/../..")"
-tmpdir="$(mktemp -d)"
-echo "Running cross tests in $tmpdir"
-cd "$tmpdir"
-git clone "$project_root" blake3
-mv blake3/c/blake3_c_rust_bindings .
-mv blake3/reference_impl blake3_c_rust_bindings
-mv blake3/c blake3_c_rust_bindings
-cd blake3_c_rust_bindings
-sed -i 's|reference_impl = { path = "../../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
-
-export BLAKE3_C_DIR_OVERRIDE="./c"
-cat > Cross.toml << EOF
-[build.env]
-passthrough = [
- "BLAKE3_C_DIR_OVERRIDE",
-]
-EOF
-cross test "$@"
diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs
deleted file mode 100644
index f18fe123f..000000000
--- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs
+++ /dev/null
@@ -1,299 +0,0 @@
-//! These are Rust bindings for the C implementation of BLAKE3. As there is a
-//! native (and faster) Rust implementation of BLAKE3 provided in this same
-//! repo, these bindings are not expected to be used in production. They're
-//! intended for testing and benchmarking.
-
-use std::ffi::{c_void, CString};
-use std::mem::MaybeUninit;
-
-#[cfg(test)]
-mod test;
-
-pub const BLOCK_LEN: usize = 64;
-pub const CHUNK_LEN: usize = 1024;
-pub const OUT_LEN: usize = 32;
-
-// Feature detection functions for tests and benchmarks. Note that the C code
-// does its own feature detection in blake3_dispatch.c.
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub fn sse2_detected() -> bool {
- is_x86_feature_detected!("sse2")
-}
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub fn sse41_detected() -> bool {
- is_x86_feature_detected!("sse4.1")
-}
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub fn avx2_detected() -> bool {
- is_x86_feature_detected!("avx2")
-}
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub fn avx512_detected() -> bool {
- is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl")
-}
-
-#[derive(Clone)]
-pub struct Hasher(ffi::blake3_hasher);
-
-impl Hasher {
- pub fn new() -> Self {
- let mut c_state = MaybeUninit::uninit();
- unsafe {
- ffi::blake3_hasher_init(c_state.as_mut_ptr());
- Self(c_state.assume_init())
- }
- }
-
- pub fn new_keyed(key: &[u8; 32]) -> Self {
- let mut c_state = MaybeUninit::uninit();
- unsafe {
- ffi::blake3_hasher_init_keyed(c_state.as_mut_ptr(), key.as_ptr());
- Self(c_state.assume_init())
- }
- }
-
- pub fn new_derive_key(context: &str) -> Self {
- let mut c_state = MaybeUninit::uninit();
- let context_c_string = CString::new(context).expect("valid C string, no null bytes");
- unsafe {
- ffi::blake3_hasher_init_derive_key(c_state.as_mut_ptr(), context_c_string.as_ptr());
- Self(c_state.assume_init())
- }
- }
-
- pub fn new_derive_key_raw(context: &[u8]) -> Self {
- let mut c_state = MaybeUninit::uninit();
- unsafe {
- ffi::blake3_hasher_init_derive_key_raw(
- c_state.as_mut_ptr(),
- context.as_ptr() as *const _,
- context.len(),
- );
- Self(c_state.assume_init())
- }
- }
-
- pub fn update(&mut self, input: &[u8]) {
- unsafe {
- ffi::blake3_hasher_update(&mut self.0, input.as_ptr() as *const c_void, input.len());
- }
- }
-
- pub fn finalize(&self, output: &mut [u8]) {
- unsafe {
- ffi::blake3_hasher_finalize(&self.0, output.as_mut_ptr(), output.len());
- }
- }
-
- pub fn finalize_seek(&self, seek: u64, output: &mut [u8]) {
- unsafe {
- ffi::blake3_hasher_finalize_seek(&self.0, seek, output.as_mut_ptr(), output.len());
- }
- }
-}
-
-pub mod ffi {
- #[repr(C)]
- #[derive(Copy, Clone)]
- pub struct blake3_chunk_state {
- pub cv: [u32; 8usize],
- pub chunk_counter: u64,
- pub buf: [u8; 64usize],
- pub buf_len: u8,
- pub blocks_compressed: u8,
- pub flags: u8,
- }
-
- #[repr(C)]
- #[derive(Copy, Clone)]
- pub struct blake3_hasher {
- pub key: [u32; 8usize],
- pub chunk: blake3_chunk_state,
- pub cv_stack_len: u8,
- pub cv_stack: [u8; 1728usize],
- }
-
- extern "C" {
- // public interface
- pub fn blake3_hasher_init(self_: *mut blake3_hasher);
- pub fn blake3_hasher_init_keyed(self_: *mut blake3_hasher, key: *const u8);
- pub fn blake3_hasher_init_derive_key(
- self_: *mut blake3_hasher,
- context: *const ::std::os::raw::c_char,
- );
- pub fn blake3_hasher_init_derive_key_raw(
- self_: *mut blake3_hasher,
- context: *const ::std::os::raw::c_void,
- context_len: usize,
- );
- pub fn blake3_hasher_update(
- self_: *mut blake3_hasher,
- input: *const ::std::os::raw::c_void,
- input_len: usize,
- );
- pub fn blake3_hasher_finalize(self_: *const blake3_hasher, out: *mut u8, out_len: usize);
- pub fn blake3_hasher_finalize_seek(
- self_: *const blake3_hasher,
- seek: u64,
- out: *mut u8,
- out_len: usize,
- );
-
- // portable low-level functions
- pub fn blake3_compress_in_place_portable(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- );
- pub fn blake3_compress_xof_portable(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
- );
- pub fn blake3_hash_many_portable(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
-
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- pub mod x86 {
- extern "C" {
- // SSE2 low level functions
- pub fn blake3_compress_in_place_sse2(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- );
- pub fn blake3_compress_xof_sse2(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
- );
- pub fn blake3_hash_many_sse2(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
-
- // SSE4.1 low level functions
- pub fn blake3_compress_in_place_sse41(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- );
- pub fn blake3_compress_xof_sse41(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
- );
- pub fn blake3_hash_many_sse41(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
-
- // AVX2 low level functions
- pub fn blake3_hash_many_avx2(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
-
- // AVX-512 low level functions
- pub fn blake3_compress_xof_avx512(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
- );
- pub fn blake3_compress_in_place_avx512(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- );
- pub fn blake3_hash_many_avx512(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
- }
-
- #[cfg(feature = "neon")]
- pub mod neon {
- extern "C" {
- // NEON low level functions
- pub fn blake3_hash_many_neon(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
- }
-}
diff --git a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs b/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs
deleted file mode 100644
index b989ae9c4..000000000
--- a/thirdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs
+++ /dev/null
@@ -1,511 +0,0 @@
-// Most of this code is duplicated from the root `blake3` crate. Perhaps we
-// could share more of it in the future.
-
-use crate::{BLOCK_LEN, CHUNK_LEN, OUT_LEN};
-use arrayref::{array_mut_ref, array_ref};
-use arrayvec::ArrayVec;
-use core::usize;
-use rand::prelude::*;
-
-// Flag bits passed in the `flags`, `flags_start` and `flags_end` arguments of
-// the FFI functions exercised below. Each flag occupies its own bit.
-const CHUNK_START: u8 = 1 << 0;
-const CHUNK_END: u8 = 1 << 1;
-const PARENT: u8 = 1 << 2;
-const ROOT: u8 = 1 << 3;
-const KEYED_HASH: u8 = 1 << 4;
-// The two derive-key flags are left commented out: nothing in this file
-// refers to them.
-// const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
-// const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
-
-// Interesting input lengths (in bytes) to run tests on: tiny inputs, lengths
-// on either side of block and chunk boundaries, and a few multi-chunk sizes.
-// The entries are in ascending order; test_compare_update_multiple relies on
-// that when it trims the long cases off the end.
-pub const TEST_CASES: &[usize] = &[
- 0,
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7,
- 8,
- BLOCK_LEN - 1,
- BLOCK_LEN,
- BLOCK_LEN + 1,
- 2 * BLOCK_LEN - 1,
- 2 * BLOCK_LEN,
- 2 * BLOCK_LEN + 1,
- CHUNK_LEN - 1,
- CHUNK_LEN,
- CHUNK_LEN + 1,
- 2 * CHUNK_LEN,
- 2 * CHUNK_LEN + 1,
- 3 * CHUNK_LEN,
- 3 * CHUNK_LEN + 1,
- 4 * CHUNK_LEN,
- 4 * CHUNK_LEN + 1,
- 5 * CHUNK_LEN,
- 5 * CHUNK_LEN + 1,
- 6 * CHUNK_LEN,
- 6 * CHUNK_LEN + 1,
- 7 * CHUNK_LEN,
- 7 * CHUNK_LEN + 1,
- 8 * CHUNK_LEN,
- 8 * CHUNK_LEN + 1,
- 16 * CHUNK_LEN, // AVX512's bandwidth
- 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1
- 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks
-];
-
-// Must equal the largest entry of TEST_CASES: tests allocate a buffer of this
-// size and slice it by each case length.
-pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN;
-
-// TEST_KEY_WORDS is TEST_KEY read as eight little-endian u32 words (for
-// example, b"what" == 1952540791u32.to_le_bytes()).
-// NOTE(review): an earlier comment promised a test asserting that the two
-// constants agree, but no such test exists in this file; keep them in sync
-// by hand.
-pub const TEST_KEY: [u8; 32] = *b"whats the Elvish word for friend";
-pub const TEST_KEY_WORDS: [u32; 8] = [
- 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
-];
-
-// Fill `buf` with the repeating byte sequence 0, 1, ..., 250, 0, 1, ...
-// The cycle length is 251 because that is the largest prime below 256, which
-// makes it unlikely that swapping two adjacent input blocks or chunks will
-// produce the same bytes (and therefore the same hash).
-fn paint_test_input(buf: &mut [u8]) {
-    for (slot, value) in buf.iter_mut().zip((0u8..251).cycle()) {
-        *slot = value;
-    }
-}
-
-// Serialize eight 32-bit words into 32 bytes: word 0 first, each word in
-// little-endian byte order.
-#[inline(always)]
-fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
-    let mut out = [0; 32];
-    for (idx, word) in words.iter().enumerate() {
-        // Bytes idx*4 .. idx*4+4 receive word `idx`.
-        *array_mut_ref!(out, idx * 4, 4) = word.to_le_bytes();
-    }
-    out
-}
-
-// Signature shared by the `blake3_compress_in_place_*` FFI functions: the
-// chaining value behind `cv` is passed as a mutable pointer and there is no
-// separate output argument.
-type CompressInPlaceFn =
- unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8);
-
-// Signature shared by the `blake3_compress_xof_*` FFI functions: `cv` is
-// read-only and the result is written through `out` (the tests below pass a
-// 64-byte buffer).
-type CompressXofFn = unsafe extern "C" fn(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
-);
-
-// Shared driver for the platform-specific compression tests. Runs both
-// supplied functions on one fixed, partially filled block and checks them
-// against the output of the portable C `blake3_compress_xof_portable`:
-// the in-place result must equal the first 32 reference bytes and the XOF
-// result must equal all 64.
-pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
-    // Only the first 61 bytes of the block are painted; the rest stay zero.
-    const PARTIAL_LEN: u8 = 61;
-    let key_words = TEST_KEY_WORDS;
-    let mut block = [0; BLOCK_LEN];
-    paint_test_input(&mut block[..PARTIAL_LEN as usize]);
-    // The counter has bits set in both of its 32-bit halves.
-    let counter = (5u64 << 32) + 6;
-    let flags = CHUNK_END | ROOT | KEYED_HASH;
-
-    // Reference output from the portable implementation.
-    let mut expected = [0; 64];
-    unsafe {
-        crate::ffi::blake3_compress_xof_portable(
-            key_words.as_ptr(),
-            block.as_ptr(),
-            PARTIAL_LEN,
-            counter,
-            flags,
-            expected.as_mut_ptr(),
-        );
-    }
-
-    // In-place variant, run on a copy of the key words.
-    let mut cv = key_words;
-    unsafe {
-        compress_in_place_fn(cv.as_mut_ptr(), block.as_ptr(), PARTIAL_LEN, counter, flags)
-    };
-    let cv_bytes = le_bytes_from_words_32(&cv);
-
-    // XOF variant, run on the untouched key words.
-    let mut xof = [0; 64];
-    unsafe {
-        compress_xof_fn(
-            key_words.as_ptr(),
-            block.as_ptr(),
-            PARTIAL_LEN,
-            counter,
-            flags,
-            xof.as_mut_ptr(),
-        )
-    };
-
-    assert_eq!(&expected[..32], &cv_bytes[..]);
-    assert_eq!(&expected[..], &xof[..]);
-}
-
-// Checks the portable functions against the portable reference. This is
-// circular, but it still exercises the shared test driver.
-#[test]
-fn test_compress_portable() {
-    let in_place: CompressInPlaceFn = crate::ffi::blake3_compress_in_place_portable;
-    let xof: CompressXofFn = crate::ffi::blake3_compress_xof_portable;
-    test_compress_fn(in_place, xof);
-}
-
-// Runs the shared compression checks on the SSE2 bindings. Does nothing when
-// `crate::sse2_detected()` returns false.
-#[test]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn test_compress_sse2() {
-    if crate::sse2_detected() {
-        test_compress_fn(
-            crate::ffi::x86::blake3_compress_in_place_sse2,
-            crate::ffi::x86::blake3_compress_xof_sse2,
-        );
-    }
-}
-
-// Runs the shared compression checks on the SSE4.1 bindings. Does nothing
-// when `crate::sse41_detected()` returns false.
-#[test]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn test_compress_sse41() {
-    if crate::sse41_detected() {
-        test_compress_fn(
-            crate::ffi::x86::blake3_compress_in_place_sse41,
-            crate::ffi::x86::blake3_compress_xof_sse41,
-        );
-    }
-}
-
-// Runs the shared compression checks on the AVX-512 bindings. Does nothing
-// when `crate::avx512_detected()` returns false.
-#[test]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn test_compress_avx512() {
-    if crate::avx512_detected() {
-        test_compress_fn(
-            crate::ffi::x86::blake3_compress_in_place_avx512,
-            crate::ffi::x86::blake3_compress_xof_avx512,
-        );
-    }
-}
-
-// Signature shared by the `blake3_hash_many_*` FFI functions. As used in
-// test_hash_many_fn below: `inputs` points to `num_inputs` input pointers,
-// `blocks` is the number of blocks per input, and `out` receives OUT_LEN
-// bytes per input.
-type HashManyFn = unsafe extern "C" fn(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
-);
-
-// A shared helper function for platform-specific tests. It runs
-// `hash_many_fn` and the portable C `blake3_hash_many_portable` on the same
-// inputs and asserts that every OUT_LEN-byte output matches, first in a
-// chunk-hashing configuration and then in a parent-hashing configuration.
-pub fn test_hash_many_fn(hash_many_fn: HashManyFn) {
- // 31 (16 + 8 + 4 + 2 + 1) inputs
- const NUM_INPUTS: usize = 31;
- let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
- crate::test::paint_test_input(&mut input_buf);
- // The starting counter is exactly u32::MAX, so with increment_counter set
- // the very next value carries into the high 32 bits.
- let counter = (1u64 << 32) - 1;
-
- // First hash chunks.
- // Each input is one CHUNK_LEN-sized window of input_buf.
- let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new();
- for i in 0..NUM_INPUTS {
- chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN));
- }
- let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- // `chunks.as_ptr() as _` reinterprets the array of array references as
- // the `*const *const u8` that the C function expects.
- crate::ffi::blake3_hash_many_portable(
- chunks.as_ptr() as _,
- chunks.len(),
- CHUNK_LEN / BLOCK_LEN,
- TEST_KEY_WORDS.as_ptr(),
- counter,
- true,
- KEYED_HASH,
- CHUNK_START,
- CHUNK_END,
- portable_chunks_out.as_mut_ptr(),
- );
- }
-
- // Same call, same arguments, through the implementation under test.
- let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- hash_many_fn(
- chunks.as_ptr() as _,
- chunks.len(),
- CHUNK_LEN / BLOCK_LEN,
- TEST_KEY_WORDS.as_ptr(),
- counter,
- true,
- KEYED_HASH,
- CHUNK_START,
- CHUNK_END,
- test_chunks_out.as_mut_ptr(),
- );
- }
- // Compare output by output; dbg! prints the index being checked.
- for n in 0..NUM_INPUTS {
- dbg!(n);
- assert_eq!(
- &portable_chunks_out[n * OUT_LEN..][..OUT_LEN],
- &test_chunks_out[n * OUT_LEN..][..OUT_LEN]
- );
- }
-
- // Then hash parents.
- // Each input is a (2 * OUT_LEN)-byte window of the same painted buffer,
- // hashed as a single block with the counter held fixed.
- let mut parents = ArrayVec::<[&[u8; 2 * OUT_LEN]; NUM_INPUTS]>::new();
- for i in 0..NUM_INPUTS {
- parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN));
- }
- let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- crate::ffi::blake3_hash_many_portable(
- parents.as_ptr() as _,
- parents.len(),
- 1,
- TEST_KEY_WORDS.as_ptr(),
- counter,
- false,
- KEYED_HASH | PARENT,
- 0,
- 0,
- portable_parents_out.as_mut_ptr(),
- );
- }
-
- let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- hash_many_fn(
- parents.as_ptr() as _,
- parents.len(),
- 1,
- TEST_KEY_WORDS.as_ptr(),
- counter,
- false,
- KEYED_HASH | PARENT,
- 0,
- 0,
- test_parents_out.as_mut_ptr(),
- );
- }
- for n in 0..NUM_INPUTS {
- dbg!(n);
- assert_eq!(
- &portable_parents_out[n * OUT_LEN..][..OUT_LEN],
- &test_parents_out[n * OUT_LEN..][..OUT_LEN]
- );
- }
-}
-
-// Checks the portable hash_many against itself. This is circular, but it
-// still exercises the shared test driver.
-#[test]
-fn test_hash_many_portable() {
-    let portable: HashManyFn = crate::ffi::blake3_hash_many_portable;
-    test_hash_many_fn(portable);
-}
-
-// Runs the shared hash_many checks on the SSE2 binding. Does nothing when
-// `crate::sse2_detected()` returns false.
-#[test]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn test_hash_many_sse2() {
-    if crate::sse2_detected() {
-        test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse2);
-    }
-}
-
-// Runs the shared hash_many checks on the SSE4.1 binding. Does nothing when
-// `crate::sse41_detected()` returns false.
-#[test]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn test_hash_many_sse41() {
-    if crate::sse41_detected() {
-        test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse41);
-    }
-}
-
-// Runs the shared hash_many checks on the AVX2 binding. Does nothing when
-// `crate::avx2_detected()` returns false.
-#[test]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn test_hash_many_avx2() {
-    if crate::avx2_detected() {
-        test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx2);
-    }
-}
-
-// Runs the shared hash_many checks on the AVX-512 binding. Does nothing when
-// `crate::avx512_detected()` returns false.
-#[test]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn test_hash_many_avx512() {
-    if crate::avx512_detected() {
-        test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx512);
-    }
-}
-
-// Runs the shared hash_many checks on the NEON binding. Only built with the
-// `neon` feature; there is no runtime detection step here.
-#[test]
-#[cfg(feature = "neon")]
-fn test_hash_many_neon() {
-    let neon: HashManyFn = crate::ffi::neon::blake3_hash_many_neon;
-    test_hash_many_fn(neon);
-}
-
-// For every length in TEST_CASES, hash the painted input with this crate's
-// `Hasher` and with `reference_impl::Hasher`, and require identical extended
-// output in each mode: plain, keyed, and derive-key (both the string and the
-// raw-bytes constructors).
-#[test]
-fn test_compare_reference_impl() {
-    // Output length under test: more than 64, not a multiple of 4.
-    const OUT: usize = 303;
-    let mut input_buf = [0; TEST_CASES_MAX];
-    paint_test_input(&mut input_buf);
-    for &case in TEST_CASES {
-        let input = &input_buf[..case];
-        dbg!(case);
-
-        // Plain hashing.
-        {
-            let mut reference = reference_impl::Hasher::new();
-            reference.update(input);
-            let mut want = [0; OUT];
-            reference.finalize(&mut want);
-
-            let mut hasher = crate::Hasher::new();
-            hasher.update(input);
-            let mut got = [0; OUT];
-            hasher.finalize(&mut got);
-            assert_eq!(got[..], want[..]);
-        }
-
-        // Keyed hashing with TEST_KEY.
-        {
-            let mut reference = reference_impl::Hasher::new_keyed(&TEST_KEY);
-            reference.update(input);
-            let mut want = [0; OUT];
-            reference.finalize(&mut want);
-
-            let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
-            hasher.update(input);
-            let mut got = [0; OUT];
-            hasher.finalize(&mut got);
-            assert_eq!(got[..], want[..]);
-        }
-
-        // Key derivation.
-        {
-            let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)";
-            let mut reference = reference_impl::Hasher::new_derive_key(context);
-            reference.update(input);
-            let mut want = [0; OUT];
-            reference.finalize(&mut want);
-
-            // Constructor taking the context as a string.
-            let mut hasher = crate::Hasher::new_derive_key(context);
-            hasher.update(input);
-            let mut got = [0; OUT];
-            hasher.finalize(&mut got);
-            assert_eq!(got[..], want[..]);
-
-            // Constructor taking the context as raw bytes.
-            let mut hasher_raw = crate::Hasher::new_derive_key_raw(context.as_bytes());
-            hasher_raw.update(input);
-            let mut got_raw = [0; OUT];
-            hasher_raw.finalize(&mut got_raw);
-            assert_eq!(got_raw[..], want[..]);
-        }
-    }
-}
-
-// Hash `input` with the reference implementation in its default mode and
-// return the first OUT_LEN bytes of output.
-fn reference_hash(input: &[u8]) -> [u8; OUT_LEN] {
-    let mut digest = [0; OUT_LEN];
-    let mut reference = reference_impl::Hasher::new();
-    reference.update(input);
-    reference.finalize(&mut digest);
-    digest
-}
-
-// Feed the hasher two consecutive updates, for every pair of lengths drawn
-// from the shorter test cases, and compare against the reference hash of the
-// concatenated input.
-#[test]
-fn test_compare_update_multiple() {
-    // Only lengths up to 4 chunks are used; the longer cases would make this
-    // quadratic test unnecessarily slow in debug mode.
-    let cutoff = TEST_CASES
-        .iter()
-        .rposition(|&len| len <= 4 * CHUNK_LEN)
-        .unwrap();
-    let short_test_cases = &TEST_CASES[..=cutoff];
-    assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN);
-
-    let mut input_buf = [0; 2 * TEST_CASES_MAX];
-    paint_test_input(&mut input_buf);
-
-    for &first_update in short_test_cases {
-        dbg!(first_update);
-        // Hasher state after the first update; cloned for each second update.
-        let mut prefix_hasher = crate::Hasher::new();
-        prefix_hasher.update(&input_buf[..first_update]);
-
-        for &second_update in short_test_cases {
-            dbg!(second_update);
-            let total_len = first_update + second_update;
-
-            // Work on a clone so the prefix state can be reused by the next
-            // iteration.
-            let mut test_hasher = prefix_hasher.clone();
-            test_hasher.update(&input_buf[first_update..total_len]);
-            let mut test_out = [0; OUT_LEN];
-            test_hasher.finalize(&mut test_out);
-
-            let expected = reference_hash(&input_buf[..total_len]);
-            assert_eq!(expected, test_out);
-        }
-    }
-}
-
-// Pseudo-random test: each iteration feeds three updates of random length to
-// a fresh hasher and compares the result with the reference hash of the same
-// bytes.
-#[test]
-fn test_fuzz_hasher() {
-    const INPUT_MAX: usize = 4 * CHUNK_LEN;
-    let mut input_buf = [0; 3 * INPUT_MAX];
-    paint_test_input(&mut input_buf);
-
-    // Keep debug builds quick (about a second) and do more work in release
-    // builds. The iteration count is fixed at compile time; this function
-    // does not read any environment variable.
-    let num_tests = match cfg!(debug_assertions) {
-        true => 100,
-        false => 10_000,
-    };
-
-    // A fixed seed makes every run draw the same sequence of lengths.
-    let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]);
-    for _num_test in 0..num_tests {
-        dbg!(_num_test);
-        let mut hasher = crate::Hasher::new();
-        let mut consumed = 0;
-        // Three updates per iteration, each of random length in 0..=INPUT_MAX.
-        for _ in 0..3 {
-            let input_len = rng.gen_range(0, INPUT_MAX + 1);
-            dbg!(input_len);
-            hasher.update(&input_buf[consumed..consumed + input_len]);
-            consumed += input_len;
-        }
-        let expected = reference_hash(&input_buf[..consumed]);
-        let mut test_out = [0; 32];
-        hasher.finalize(&mut test_out);
-        assert_eq!(expected, test_out);
-    }
-}
-
-// Check that `finalize_seek` at various offsets returns the corresponding
-// window of the reference implementation's 1000-byte output.
-#[test]
-fn test_finalize_seek() {
-    // Reference output stream for the fixed input.
-    let mut expected = [0; 1000];
-    let mut reference = reference_impl::Hasher::new();
-    reference.update(b"foobarbaz");
-    reference.finalize(&mut expected);
-
-    let mut test_hasher = crate::Hasher::new();
-    test_hasher.update(b"foobarbaz");
-
-    let mut out = [0; 103];
-    // The last offset ends the window exactly at the end of `expected`.
-    let last_seek = expected.len() - out.len();
-    for &seek in &[0, 1, 7, 59, 63, 64, 65, 501, last_seek] {
-        dbg!(seek);
-        test_hasher.finalize_seek(seek as u64, &mut out);
-        assert_eq!(&expected[seek..][..out.len()], &out[..]);
-    }
-}
diff --git a/thirdparty/BLAKE3/c/blake3_dispatch.c b/thirdparty/BLAKE3/c/blake3_dispatch.c
deleted file mode 100644
index 6518478e5..000000000
--- a/thirdparty/BLAKE3/c/blake3_dispatch.c
+++ /dev/null
@@ -1,276 +0,0 @@
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "blake3_impl.h"
-
-#if defined(IS_X86)
-#if defined(_MSC_VER)
-#include <intrin.h>
-#elif defined(__GNUC__)
-#include <immintrin.h>
-#else
-#error "Unimplemented!"
-#endif
-#endif
-
-// Evaluates x and discards the result. The dispatch functions below apply it
-// to `features`, which goes unread when every BLAKE3_NO_* macro is defined.
-#define MAYBE_UNUSED(x) (void)((x))
-
-#if defined(IS_X86)
-// Read extended control register 0 via XGETBV (ECX = 0) and return the
-// EDX:EAX pair as one 64-bit value. MSVC uses the _xgetbv intrinsic; other
-// compilers use inline assembly.
-static uint64_t xgetbv() {
-#if defined(_MSC_VER)
- return _xgetbv(0);
-#else
- uint32_t eax = 0, edx = 0;
- __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
- return ((uint64_t)edx << 32) | eax;
-#endif
-}
-
-// Execute CPUID for leaf `id` and store EAX, EBX, ECX, EDX in out[0..3].
-// Unlike cpuidex below, the non-MSVC paths do not initialise ECX, so the
-// sub-leaf is unspecified there.
-static void cpuid(uint32_t out[4], uint32_t id) {
-#if defined(_MSC_VER)
- __cpuid((int *)out, id);
-#elif defined(__i386__) || defined(_M_IX86)
- // 32-bit path: the EBX result is moved through a scratch register and EBX
- // is restored afterwards with xchgl.
- // NOTE(review): presumably because EBX can be reserved (e.g. as the PIC
- // register) on i386 - confirm.
- __asm__ __volatile__("movl %%ebx, %1\n"
- "cpuid\n"
- "xchgl %1, %%ebx\n"
- : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
- : "a"(id));
-#else
- __asm__ __volatile__("cpuid\n"
- : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
- : "a"(id));
-#endif
-}
-
-// Same as cpuid(), but also loads the sub-leaf `sid` into ECX before
-// executing CPUID. Results go to out[0..3] as EAX, EBX, ECX, EDX.
-static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
-#if defined(_MSC_VER)
- __cpuidex((int *)out, id, sid);
-#elif defined(__i386__) || defined(_M_IX86)
- // 32-bit path: same EBX save/restore sequence as in cpuid().
- __asm__ __volatile__("movl %%ebx, %1\n"
- "cpuid\n"
- "xchgl %1, %%ebx\n"
- : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
- : "a"(id), "c"(sid));
-#else
- __asm__ __volatile__("cpuid\n"
- : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
- : "a"(id), "c"(sid));
-#endif
-}
-
-#endif
-
-// Bit flags describing detected CPU features; values are OR-ed together into
-// one mask by get_cpu_features().
-enum cpu_feature {
- SSE2 = 1 << 0,
- SSSE3 = 1 << 1,
- SSE41 = 1 << 2,
- AVX = 1 << 3,
- AVX2 = 1 << 4,
- AVX512F = 1 << 5,
- AVX512VL = 1 << 6,
- /* ... */
- // Sentinel stored in g_cpu_features until detection has run.
- UNDEFINED = 1 << 30
-};
-
-// Cached detection result. It starts as UNDEFINED and is overwritten by
-// get_cpu_features() on x86. It has internal linkage unless BLAKE3_TESTING is
-// defined.
-#if !defined(BLAKE3_TESTING)
-static /* Allow the variable to be controlled manually for testing */
-#endif
- enum cpu_feature g_cpu_features = UNDEFINED;
-
-// Return the mask of `enum cpu_feature` flags for the current CPU.
-//
-// On x86 the mask is computed from CPUID/XGETBV on the first call and cached
-// in g_cpu_features; later calls return the cached value. The cache is a
-// plain global with no synchronisation. On other architectures no detection
-// is performed and 0 is returned (nothing is cached).
-//
-// The function has internal linkage unless BLAKE3_TESTING is defined.
-#if !defined(BLAKE3_TESTING)
-static
-#endif
-    enum cpu_feature
-    get_cpu_features(void) {
-
-  if (g_cpu_features != UNDEFINED) {
-    return g_cpu_features;
-  } else {
-#if defined(IS_X86)
-    uint32_t regs[4] = {0};
-    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
-    (void)edx; // edx is only read in the 32-bit branch below
-    enum cpu_feature features = 0;
-    // Leaf 0: EAX holds the highest supported basic leaf.
-    cpuid(regs, 0);
-    const int max_id = *eax;
-    // Leaf 1: basic feature bits in ECX/EDX.
-    cpuid(regs, 1);
-#if defined(__amd64__) || defined(_M_X64)
-    // Treated as always present on x86-64 builds.
-    features |= SSE2;
-#else
-    if (*edx & (1UL << 26))
-      features |= SSE2;
-#endif
-    // CPUID.01H:ECX bit 9 is SSSE3. Bit 0 is SSE3, a different extension,
-    // and must not be used here.
-    if (*ecx & (1UL << 9))
-      features |= SSSE3;
-    if (*ecx & (1UL << 19))
-      features |= SSE41;
-
-    if (*ecx & (1UL << 27)) { // OSXSAVE
-      const uint64_t mask = xgetbv();
-      if ((mask & 6) == 6) { // SSE and AVX states
-        if (*ecx & (1UL << 28))
-          features |= AVX;
-        if (max_id >= 7) {
-          // Leaf 7, sub-leaf 0: extended feature bits in EBX.
-          cpuidex(regs, 7, 0);
-          if (*ebx & (1UL << 5))
-            features |= AVX2;
-          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
-            if (*ebx & (1UL << 31))
-              features |= AVX512VL;
-            if (*ebx & (1UL << 16))
-              features |= AVX512F;
-          }
-        }
-      }
-    }
-    g_cpu_features = features;
-    return features;
-#else
-    /* How to detect NEON? */
-    return 0;
-#endif
-  }
-}
-
-// Dispatch an in-place compression to the first implementation that is both
-// compiled in (no BLAKE3_NO_* macro) and reported by get_cpu_features(),
-// tried in the order AVX-512, SSE4.1, SSE2. Falls back to the portable
-// implementation otherwise, including on all non-x86 builds.
-// NOTE(review): the AVX-512 path here tests only AVX512VL, while
-// blake3_hash_many requires AVX512F and AVX512VL together - confirm this
-// difference is intended.
-void blake3_compress_in_place(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags) {
-#if defined(IS_X86)
- const enum cpu_feature features = get_cpu_features();
- MAYBE_UNUSED(features);
-#if !defined(BLAKE3_NO_AVX512)
- if (features & AVX512VL) {
- blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
- return;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE41)
- if (features & SSE41) {
- blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
- return;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE2)
- if (features & SSE2) {
- blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
- return;
- }
-#endif
-#endif
- blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
-}
-
-// Dispatch an XOF compression (64 bytes written to `out`, `cv` left
-// unmodified) using the same selection order as blake3_compress_in_place:
-// AVX-512, SSE4.1, SSE2, then portable.
-// NOTE(review): as in blake3_compress_in_place, only AVX512VL is tested for
-// the AVX-512 path.
-void blake3_compress_xof(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter, uint8_t flags,
- uint8_t out[64]) {
-#if defined(IS_X86)
- const enum cpu_feature features = get_cpu_features();
- MAYBE_UNUSED(features);
-#if !defined(BLAKE3_NO_AVX512)
- if (features & AVX512VL) {
- blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
- return;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE41)
- if (features & SSE41) {
- blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
- return;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE2)
- if (features & SSE2) {
- blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
- return;
- }
-#endif
-#endif
- blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
-}
-
-// Dispatch a multi-input hash to the widest available implementation. On x86
-// the order is AVX-512 (needs AVX512F and AVX512VL), AVX2, SSE4.1, SSE2, each
-// subject to its BLAKE3_NO_* macro and to runtime detection. If none is
-// taken, the NEON implementation is used when BLAKE3_USE_NEON is defined,
-// otherwise the portable one.
-void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8], uint64_t counter,
- bool increment_counter, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
-#if defined(IS_X86)
- const enum cpu_feature features = get_cpu_features();
- MAYBE_UNUSED(features);
-#if !defined(BLAKE3_NO_AVX512)
- if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
- blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
- increment_counter, flags, flags_start, flags_end,
- out);
- return;
- }
-#endif
-#if !defined(BLAKE3_NO_AVX2)
- if (features & AVX2) {
- blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
- increment_counter, flags, flags_start, flags_end,
- out);
- return;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE41)
- if (features & SSE41) {
- blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
- increment_counter, flags, flags_start, flags_end,
- out);
- return;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE2)
- if (features & SSE2) {
- blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
- increment_counter, flags, flags_start, flags_end,
- out);
- return;
- }
-#endif
-#endif
-
-// With BLAKE3_USE_NEON defined this branch returns unconditionally, so the
-// portable call that follows it is never reached in that configuration.
-#if defined(BLAKE3_USE_NEON)
- blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
- increment_counter, flags, flags_start, flags_end, out);
- return;
-#endif
-
- blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
- increment_counter, flags, flags_start, flags_end,
- out);
-}
-
-// The dynamically detected SIMD degree of the current platform.
-// Returns 16 for AVX-512 (AVX512F and AVX512VL both present), 8 for AVX2,
-// 4 for SSE4.1 or SSE2, 4 when built with BLAKE3_USE_NEON, and 1 otherwise.
-// The x86 checks follow the same order and BLAKE3_NO_* gating as
-// blake3_hash_many.
-size_t blake3_simd_degree(void) {
-#if defined(IS_X86)
- const enum cpu_feature features = get_cpu_features();
- MAYBE_UNUSED(features);
-#if !defined(BLAKE3_NO_AVX512)
- if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
- return 16;
- }
-#endif
-#if !defined(BLAKE3_NO_AVX2)
- if (features & AVX2) {
- return 8;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE41)
- if (features & SSE41) {
- return 4;
- }
-#endif
-#if !defined(BLAKE3_NO_SSE2)
- if (features & SSE2) {
- return 4;
- }
-#endif
-#endif
-#if defined(BLAKE3_USE_NEON)
- return 4;
-#endif
- return 1;
-}
diff --git a/thirdparty/BLAKE3/c/blake3_impl.h b/thirdparty/BLAKE3/c/blake3_impl.h
deleted file mode 100644
index 86ab6aa25..000000000
--- a/thirdparty/BLAKE3/c/blake3_impl.h
+++ /dev/null
@@ -1,269 +0,0 @@
-#ifndef BLAKE3_IMPL_H
-#define BLAKE3_IMPL_H
-
-#include <assert.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "blake3.h"
-
-// internal flags
-// Bit flags passed in the `flags`, `flags_start` and `flags_end` arguments of
-// the compression and hash_many functions declared in this header. Each flag
-// occupies its own bit, so they can be OR-ed together.
-enum blake3_flags {
- CHUNK_START = 1 << 0,
- CHUNK_END = 1 << 1,
- PARENT = 1 << 2,
- ROOT = 1 << 3,
- KEYED_HASH = 1 << 4,
- DERIVE_KEY_CONTEXT = 1 << 5,
- DERIVE_KEY_MATERIAL = 1 << 6,
-};
-
-// This C implementation tries to support recent versions of GCC, Clang, and
-// MSVC.
-// INLINE marks a helper as a static function that the compiler is told to
-// always inline, using each compiler's own spelling.
-#if defined(_MSC_VER)
-#define INLINE static __forceinline
-#else
-#define INLINE static inline __attribute__((always_inline))
-#endif
-
-// Architecture detection: IS_X86 is defined for both 32-bit and 64-bit x86,
-// together with exactly one of IS_X86_64 / IS_X86_32.
-#if defined(__x86_64__) || defined(_M_X64)
-#define IS_X86
-#define IS_X86_64
-#endif
-
-#if defined(__i386__) || defined(_M_IX86)
-#define IS_X86
-#define IS_X86_32
-#endif
-
-// x86 intrinsics headers (<intrin.h> is added on MSVC).
-#if defined(IS_X86)
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-#include <immintrin.h>
-#endif
-
-// Compile-time upper bound on the SIMD degree: 16 on x86, 4 with NEON,
-// 1 otherwise. These match the largest values blake3_simd_degree() returns
-// in blake3_dispatch.c.
-#if defined(IS_X86)
-#define MAX_SIMD_DEGREE 16
-#elif defined(BLAKE3_USE_NEON)
-#define MAX_SIMD_DEGREE 4
-#else
-#define MAX_SIMD_DEGREE 1
-#endif
-
-// There are some places where we want a static size that's equal to the
-// MAX_SIMD_DEGREE, but also at least 2.
-#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
-
-// Eight 32-bit constants named IV.
-// NOTE(review): presumably the initialization vector used as the default
-// key/chaining value - its uses are outside this header, confirm there.
-static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
- 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
- 0x1F83D9ABUL, 0x5BE0CD19UL};
-
-// Message word order for each of the 7 rounds, indexed as
-// MSG_SCHEDULE[round][position]. Row 0 is the identity order and every row
-// is a permutation of 0..15.
-static const uint8_t MSG_SCHEDULE[7][16] = {
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
- {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
- {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
- {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
- {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
- {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
-};
-
-/* Find the index (0..63) of the highest set bit of x. */
-/* x is assumed to be nonzero; the GCC/Clang and MSVC branches give no */
-/* meaningful result for x == 0. */
-static unsigned int highest_one(uint64_t x) {
-#if defined(__GNUC__) || defined(__clang__)
-  /* clz counts leading zeros; 63 ^ clz turns that into a bit index. */
-  return 63 ^ (unsigned int)__builtin_clzll(x);
-#elif defined(_MSC_VER) && defined(IS_X86_64)
-  unsigned long index;
-  _BitScanReverse64(&index, x);
-  return (unsigned int)index;
-#elif defined(_MSC_VER) && defined(IS_X86_32)
-  /* _BitScanReverse takes a 32-bit unsigned long, so scan the two halves */
-  /* separately and make each narrowing conversion explicit. */
-  if (x >> 32) {
-    unsigned long index;
-    _BitScanReverse(&index, (unsigned long)(x >> 32));
-    return 32 + (unsigned int)index;
-  } else {
-    unsigned long index;
-    _BitScanReverse(&index, (unsigned long)x);
-    return (unsigned int)index;
-  }
-#else
-  /* Portable fallback: binary search over the bit positions. */
-  unsigned int c = 0;
-  if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
-  if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
-  if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
-  if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
-  if (x & 0x000000000000000cULL) { x >>= 2; c += 2; }
-  if (x & 0x0000000000000002ULL) { c += 1; }
-  return c;
-#endif
-}
-
-// Return the number of bits set to 1 in x.
-INLINE unsigned int popcnt(uint64_t x) {
-#if defined(__GNUC__) || defined(__clang__)
-  return __builtin_popcountll(x);
-#else
-  // Each pass clears the lowest set bit, so the loop runs once per set bit.
-  unsigned int bits = 0;
-  for (; x != 0; x &= x - 1) {
-    bits++;
-  }
-  return bits;
-#endif
-}
-
-// Return the largest power of two that is less than or equal to x.
-// x == 0 is a special case and yields 1.
-INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
-  // OR-ing in bit 0 keeps the argument of highest_one nonzero and does not
-  // change the position of the top bit of any nonzero x.
-  const unsigned int top_bit = highest_one(x | 1);
-  return 1ULL << top_bit;
-}
-
-// Low 32 bits of the 64-bit counter.
-INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
-
-// High 32 bits of the 64-bit counter.
-INLINE uint32_t counter_high(uint64_t counter) {
- return (uint32_t)(counter >> 32);
-}
-
-// Read a 32-bit little-endian value from the 4 bytes at src. Works byte by
-// byte, so src needs no particular alignment and the result does not depend
-// on host endianness.
-INLINE uint32_t load32(const void *src) {
-  const uint8_t *bytes = (const uint8_t *)src;
-  uint32_t word = 0;
-  // Fold in the bytes from most significant (bytes[3]) to least (bytes[0]).
-  for (int i = 3; i >= 0; i--) {
-    word = (word << 8) | (uint32_t)bytes[i];
-  }
-  return word;
-}
-
-// Convert a key of bytes into eight 32-bit words: word i is the
-// little-endian value of key bytes 4*i .. 4*i+3.
-INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
-                           uint32_t key_words[8]) {
-  for (size_t i = 0; i < 8; i++) {
-    key_words[i] = load32(&key[i * 4]);
-  }
-}
-
-// Write w to the 4 bytes at dst in little-endian order. Works byte by byte,
-// so dst needs no particular alignment.
-INLINE void store32(void *dst, uint32_t w) {
-  uint8_t *bytes = (uint8_t *)dst;
-  for (unsigned int i = 0; i < 4; i++) {
-    bytes[i] = (uint8_t)(w >> (8 * i));
-  }
-}
-
-// Serialize eight 32-bit chaining-value words into 32 bytes: word i goes to
-// bytes 4*i .. 4*i+3 in little-endian order.
-INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
-  for (size_t i = 0; i < 8; i++) {
-    store32(&bytes_out[i * 4], cv_words[i]);
-  }
-}
-
-// Dispatching entry points, defined in blake3_dispatch.c. Each one forwards
-// to a platform-specific implementation or to the portable fallback.
-
-// Compression that takes `cv` as a mutable array and has no separate output
-// argument.
-void blake3_compress_in_place(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-
-// Compression that leaves `cv` untouched and writes 64 bytes to `out`.
-void blake3_compress_xof(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter, uint8_t flags,
- uint8_t out[64]);
-
-// Multi-input hash: `inputs` is an array of `num_inputs` pointers.
-// NOTE(review): each input presumably holds `blocks` blocks and `out`
-// receives one fixed-size result per input - confirm in the implementations.
-void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8], uint64_t counter,
- bool increment_counter, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end, uint8_t *out);
-
-// SIMD degree selected at run time (see blake3_dispatch.c).
-size_t blake3_simd_degree(void);
-
-
-// Declarations for implementation-specific functions.
-// The portable variants are declared unconditionally; the dispatch functions
-// use them as the fallback when no SIMD implementation is selected.
-void blake3_compress_in_place_portable(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-
-void blake3_compress_xof_portable(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]);
-
-void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-
-// x86 SIMD variants. Each group is declared only when the matching
-// BLAKE3_NO_* macro is not defined. Signatures mirror the portable functions.
-#if defined(IS_X86)
-#if !defined(BLAKE3_NO_SSE2)
-void blake3_compress_in_place_sse2(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-void blake3_compress_xof_sse2(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]);
-void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-#endif
-#if !defined(BLAKE3_NO_SSE41)
-void blake3_compress_in_place_sse41(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-void blake3_compress_xof_sse41(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]);
-void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-#endif
-// AVX2 provides only a hash_many variant; no AVX2 compress functions are
-// declared here.
-#if !defined(BLAKE3_NO_AVX2)
-void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-#endif
-#if !defined(BLAKE3_NO_AVX512)
-void blake3_compress_in_place_avx512(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-
-void blake3_compress_xof_avx512(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]);
-
-void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-#endif
-#endif
-
-// NEON variant, declared only when BLAKE3_USE_NEON is defined. Like AVX2 it
-// provides hash_many only.
-#if defined(BLAKE3_USE_NEON)
-void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-#endif
-
-
-#endif /* BLAKE3_IMPL_H */
diff --git a/thirdparty/BLAKE3/c/blake3_neon.c b/thirdparty/BLAKE3/c/blake3_neon.c
deleted file mode 100644
index 46691f526..000000000
--- a/thirdparty/BLAKE3/c/blake3_neon.c
+++ /dev/null
@@ -1,346 +0,0 @@
-#include "blake3_impl.h"
-
-#include <arm_neon.h>
-
-// TODO: This is probably incorrect for big-endian ARM. How should that work?
-INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
- // vld1q_u32 has alignment requirements. Don't use it.
- uint32x4_t x;
- memcpy(&x, src, 16);
- return x;
-}
-
-INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
- // vst1q_u32 has alignment requirements. Don't use it.
- memcpy(dest, &src, 16);
-}
-
-INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
- return vaddq_u32(a, b);
-}
-
-INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
- return veorq_u32(a, b);
-}
-
-INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
-
-INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- uint32_t array[4] = {a, b, c, d};
- return vld1q_u32(array);
-}
-
-INLINE uint32x4_t rot16_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
-}
-
-INLINE uint32x4_t rot12_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
-}
-
-INLINE uint32x4_t rot8_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
-}
-
-INLINE uint32x4_t rot7_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
-}
-
-// TODO: compress_neon
-
-// TODO: hash2_neon
-
-/*
- * ----------------------------------------------------------------------------
- * hash4_neon
- * ----------------------------------------------------------------------------
- */
-
-INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
- v[0] = add_128(v[0], v[4]);
- v[1] = add_128(v[1], v[5]);
- v[2] = add_128(v[2], v[6]);
- v[3] = add_128(v[3], v[7]);
- v[12] = xor_128(v[12], v[0]);
- v[13] = xor_128(v[13], v[1]);
- v[14] = xor_128(v[14], v[2]);
- v[15] = xor_128(v[15], v[3]);
- v[12] = rot16_128(v[12]);
- v[13] = rot16_128(v[13]);
- v[14] = rot16_128(v[14]);
- v[15] = rot16_128(v[15]);
- v[8] = add_128(v[8], v[12]);
- v[9] = add_128(v[9], v[13]);
- v[10] = add_128(v[10], v[14]);
- v[11] = add_128(v[11], v[15]);
- v[4] = xor_128(v[4], v[8]);
- v[5] = xor_128(v[5], v[9]);
- v[6] = xor_128(v[6], v[10]);
- v[7] = xor_128(v[7], v[11]);
- v[4] = rot12_128(v[4]);
- v[5] = rot12_128(v[5]);
- v[6] = rot12_128(v[6]);
- v[7] = rot12_128(v[7]);
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
- v[0] = add_128(v[0], v[4]);
- v[1] = add_128(v[1], v[5]);
- v[2] = add_128(v[2], v[6]);
- v[3] = add_128(v[3], v[7]);
- v[12] = xor_128(v[12], v[0]);
- v[13] = xor_128(v[13], v[1]);
- v[14] = xor_128(v[14], v[2]);
- v[15] = xor_128(v[15], v[3]);
- v[12] = rot8_128(v[12]);
- v[13] = rot8_128(v[13]);
- v[14] = rot8_128(v[14]);
- v[15] = rot8_128(v[15]);
- v[8] = add_128(v[8], v[12]);
- v[9] = add_128(v[9], v[13]);
- v[10] = add_128(v[10], v[14]);
- v[11] = add_128(v[11], v[15]);
- v[4] = xor_128(v[4], v[8]);
- v[5] = xor_128(v[5], v[9]);
- v[6] = xor_128(v[6], v[10]);
- v[7] = xor_128(v[7], v[11]);
- v[4] = rot7_128(v[4]);
- v[5] = rot7_128(v[5]);
- v[6] = rot7_128(v[6]);
- v[7] = rot7_128(v[7]);
-
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
- v[0] = add_128(v[0], v[5]);
- v[1] = add_128(v[1], v[6]);
- v[2] = add_128(v[2], v[7]);
- v[3] = add_128(v[3], v[4]);
- v[15] = xor_128(v[15], v[0]);
- v[12] = xor_128(v[12], v[1]);
- v[13] = xor_128(v[13], v[2]);
- v[14] = xor_128(v[14], v[3]);
- v[15] = rot16_128(v[15]);
- v[12] = rot16_128(v[12]);
- v[13] = rot16_128(v[13]);
- v[14] = rot16_128(v[14]);
- v[10] = add_128(v[10], v[15]);
- v[11] = add_128(v[11], v[12]);
- v[8] = add_128(v[8], v[13]);
- v[9] = add_128(v[9], v[14]);
- v[5] = xor_128(v[5], v[10]);
- v[6] = xor_128(v[6], v[11]);
- v[7] = xor_128(v[7], v[8]);
- v[4] = xor_128(v[4], v[9]);
- v[5] = rot12_128(v[5]);
- v[6] = rot12_128(v[6]);
- v[7] = rot12_128(v[7]);
- v[4] = rot12_128(v[4]);
- v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
- v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
- v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
- v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
- v[0] = add_128(v[0], v[5]);
- v[1] = add_128(v[1], v[6]);
- v[2] = add_128(v[2], v[7]);
- v[3] = add_128(v[3], v[4]);
- v[15] = xor_128(v[15], v[0]);
- v[12] = xor_128(v[12], v[1]);
- v[13] = xor_128(v[13], v[2]);
- v[14] = xor_128(v[14], v[3]);
- v[15] = rot8_128(v[15]);
- v[12] = rot8_128(v[12]);
- v[13] = rot8_128(v[13]);
- v[14] = rot8_128(v[14]);
- v[10] = add_128(v[10], v[15]);
- v[11] = add_128(v[11], v[12]);
- v[8] = add_128(v[8], v[13]);
- v[9] = add_128(v[9], v[14]);
- v[5] = xor_128(v[5], v[10]);
- v[6] = xor_128(v[6], v[11]);
- v[7] = xor_128(v[7], v[8]);
- v[4] = xor_128(v[4], v[9]);
- v[5] = rot7_128(v[5]);
- v[6] = rot7_128(v[6]);
- v[7] = rot7_128(v[7]);
- v[4] = rot7_128(v[4]);
-}
-
-INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
- // Individually transpose the four 2x2 sub-matrices in each corner.
- uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
- uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
-
- // Swap the top-right and bottom-left 2x2s (which just got transposed).
- vecs[0] =
- vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
- vecs[1] =
- vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
- vecs[2] =
- vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
- vecs[3] =
- vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
-}
-
-INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
- size_t block_offset, uint32x4_t out[16]) {
- out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
- out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
- out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
- out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
- out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
- out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
- out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
- out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
- out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
- out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
- out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
- out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
- out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
- out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
- out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
- out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
- transpose_vecs_128(&out[0]);
- transpose_vecs_128(&out[4]);
- transpose_vecs_128(&out[8]);
- transpose_vecs_128(&out[12]);
-}
-
-INLINE void load_counters4(uint64_t counter, bool increment_counter,
- uint32x4_t *out_low, uint32x4_t *out_high) {
- uint64_t mask = (increment_counter ? ~0 : 0);
- *out_low = set4(
- counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
- counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
- *out_high = set4(
- counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
- counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
-}
-
-void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
- const uint32_t key[8], uint64_t counter,
- bool increment_counter, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
- uint32x4_t h_vecs[8] = {
- set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
- set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
- };
- uint32x4_t counter_low_vec, counter_high_vec;
- load_counters4(counter, increment_counter, &counter_low_vec,
- &counter_high_vec);
- uint8_t block_flags = flags | flags_start;
-
- for (size_t block = 0; block < blocks; block++) {
- if (block + 1 == blocks) {
- block_flags |= flags_end;
- }
- uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
- uint32x4_t block_flags_vec = set1_128(block_flags);
- uint32x4_t msg_vecs[16];
- transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
-
- uint32x4_t v[16] = {
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
- set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
- };
- round_fn4(v, msg_vecs, 0);
- round_fn4(v, msg_vecs, 1);
- round_fn4(v, msg_vecs, 2);
- round_fn4(v, msg_vecs, 3);
- round_fn4(v, msg_vecs, 4);
- round_fn4(v, msg_vecs, 5);
- round_fn4(v, msg_vecs, 6);
- h_vecs[0] = xor_128(v[0], v[8]);
- h_vecs[1] = xor_128(v[1], v[9]);
- h_vecs[2] = xor_128(v[2], v[10]);
- h_vecs[3] = xor_128(v[3], v[11]);
- h_vecs[4] = xor_128(v[4], v[12]);
- h_vecs[5] = xor_128(v[5], v[13]);
- h_vecs[6] = xor_128(v[6], v[14]);
- h_vecs[7] = xor_128(v[7], v[15]);
-
- block_flags = flags;
- }
-
- transpose_vecs_128(&h_vecs[0]);
- transpose_vecs_128(&h_vecs[4]);
- // The first four vecs now contain the first half of each output, and the
- // second four vecs contain the second half of each output.
- storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
- storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
- storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
- storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
- storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
- storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
- storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
- storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
-}
-
-/*
- * ----------------------------------------------------------------------------
- * hash_many_neon
- * ----------------------------------------------------------------------------
- */
-
-void blake3_compress_in_place_portable(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-
-INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
- const uint32_t key[8], uint64_t counter,
- uint8_t flags, uint8_t flags_start, uint8_t flags_end,
- uint8_t out[BLAKE3_OUT_LEN]) {
- uint32_t cv[8];
- memcpy(cv, key, BLAKE3_KEY_LEN);
- uint8_t block_flags = flags | flags_start;
- while (blocks > 0) {
- if (blocks == 1) {
- block_flags |= flags_end;
- }
- // TODO: Implement compress_neon. However note that according to
- // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
- // compress_neon might not be any faster than compress_portable.
- blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
- block_flags);
- input = &input[BLAKE3_BLOCK_LEN];
- blocks -= 1;
- block_flags = flags;
- }
- memcpy(out, cv, BLAKE3_OUT_LEN);
-}
-
-void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out) {
- while (num_inputs >= 4) {
- blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
- flags_start, flags_end, out);
- if (increment_counter) {
- counter += 4;
- }
- inputs += 4;
- num_inputs -= 4;
- out = &out[4 * BLAKE3_OUT_LEN];
- }
- while (num_inputs > 0) {
- hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
- flags_end, out);
- if (increment_counter) {
- counter += 1;
- }
- inputs += 1;
- num_inputs -= 1;
- out = &out[BLAKE3_OUT_LEN];
- }
-}
diff --git a/thirdparty/BLAKE3/c/blake3_portable.c b/thirdparty/BLAKE3/c/blake3_portable.c
deleted file mode 100644
index 062dd1b47..000000000
--- a/thirdparty/BLAKE3/c/blake3_portable.c
+++ /dev/null
@@ -1,160 +0,0 @@
-#include "blake3_impl.h"
-#include <string.h>
-
-INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
- return (w >> c) | (w << (32 - c));
-}
-
-INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
- uint32_t x, uint32_t y) {
- state[a] = state[a] + state[b] + x;
- state[d] = rotr32(state[d] ^ state[a], 16);
- state[c] = state[c] + state[d];
- state[b] = rotr32(state[b] ^ state[c], 12);
- state[a] = state[a] + state[b] + y;
- state[d] = rotr32(state[d] ^ state[a], 8);
- state[c] = state[c] + state[d];
- state[b] = rotr32(state[b] ^ state[c], 7);
-}
-
-INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
- // Select the message schedule based on the round.
- const uint8_t *schedule = MSG_SCHEDULE[round];
-
- // Mix the columns.
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
- g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
- g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
- g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
-
- // Mix the rows.
- g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
- g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
- g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
- g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
-}
-
-INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter, uint8_t flags) {
- uint32_t block_words[16];
- block_words[0] = load32(block + 4 * 0);
- block_words[1] = load32(block + 4 * 1);
- block_words[2] = load32(block + 4 * 2);
- block_words[3] = load32(block + 4 * 3);
- block_words[4] = load32(block + 4 * 4);
- block_words[5] = load32(block + 4 * 5);
- block_words[6] = load32(block + 4 * 6);
- block_words[7] = load32(block + 4 * 7);
- block_words[8] = load32(block + 4 * 8);
- block_words[9] = load32(block + 4 * 9);
- block_words[10] = load32(block + 4 * 10);
- block_words[11] = load32(block + 4 * 11);
- block_words[12] = load32(block + 4 * 12);
- block_words[13] = load32(block + 4 * 13);
- block_words[14] = load32(block + 4 * 14);
- block_words[15] = load32(block + 4 * 15);
-
- state[0] = cv[0];
- state[1] = cv[1];
- state[2] = cv[2];
- state[3] = cv[3];
- state[4] = cv[4];
- state[5] = cv[5];
- state[6] = cv[6];
- state[7] = cv[7];
- state[8] = IV[0];
- state[9] = IV[1];
- state[10] = IV[2];
- state[11] = IV[3];
- state[12] = counter_low(counter);
- state[13] = counter_high(counter);
- state[14] = (uint32_t)block_len;
- state[15] = (uint32_t)flags;
-
- round_fn(state, &block_words[0], 0);
- round_fn(state, &block_words[0], 1);
- round_fn(state, &block_words[0], 2);
- round_fn(state, &block_words[0], 3);
- round_fn(state, &block_words[0], 4);
- round_fn(state, &block_words[0], 5);
- round_fn(state, &block_words[0], 6);
-}
-
-void blake3_compress_in_place_portable(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags) {
- uint32_t state[16];
- compress_pre(state, cv, block, block_len, counter, flags);
- cv[0] = state[0] ^ state[8];
- cv[1] = state[1] ^ state[9];
- cv[2] = state[2] ^ state[10];
- cv[3] = state[3] ^ state[11];
- cv[4] = state[4] ^ state[12];
- cv[5] = state[5] ^ state[13];
- cv[6] = state[6] ^ state[14];
- cv[7] = state[7] ^ state[15];
-}
-
-void blake3_compress_xof_portable(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]) {
- uint32_t state[16];
- compress_pre(state, cv, block, block_len, counter, flags);
-
- store32(&out[0 * 4], state[0] ^ state[8]);
- store32(&out[1 * 4], state[1] ^ state[9]);
- store32(&out[2 * 4], state[2] ^ state[10]);
- store32(&out[3 * 4], state[3] ^ state[11]);
- store32(&out[4 * 4], state[4] ^ state[12]);
- store32(&out[5 * 4], state[5] ^ state[13]);
- store32(&out[6 * 4], state[6] ^ state[14]);
- store32(&out[7 * 4], state[7] ^ state[15]);
- store32(&out[8 * 4], state[8] ^ cv[0]);
- store32(&out[9 * 4], state[9] ^ cv[1]);
- store32(&out[10 * 4], state[10] ^ cv[2]);
- store32(&out[11 * 4], state[11] ^ cv[3]);
- store32(&out[12 * 4], state[12] ^ cv[4]);
- store32(&out[13 * 4], state[13] ^ cv[5]);
- store32(&out[14 * 4], state[14] ^ cv[6]);
- store32(&out[15 * 4], state[15] ^ cv[7]);
-}
-
-INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
- const uint32_t key[8], uint64_t counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
- uint32_t cv[8];
- memcpy(cv, key, BLAKE3_KEY_LEN);
- uint8_t block_flags = flags | flags_start;
- while (blocks > 0) {
- if (blocks == 1) {
- block_flags |= flags_end;
- }
- blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
- block_flags);
- input = &input[BLAKE3_BLOCK_LEN];
- blocks -= 1;
- block_flags = flags;
- }
- store_cv_words(out, cv);
-}
-
-void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out) {
- while (num_inputs > 0) {
- hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
- flags_end, out);
- if (increment_counter) {
- counter += 1;
- }
- inputs += 1;
- num_inputs -= 1;
- out = &out[BLAKE3_OUT_LEN];
- }
-}
diff --git a/thirdparty/BLAKE3/c/blake3_sse2.c b/thirdparty/BLAKE3/c/blake3_sse2.c
deleted file mode 100644
index 159296688..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse2.c
+++ /dev/null
@@ -1,565 +0,0 @@
-#include "blake3_impl.h"
-
-#include <immintrin.h>
-
-#define DEGREE 4
-
-#define _mm_shuffle_ps2(a, b, c) \
- (_mm_castps_si128( \
- _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
-
-INLINE __m128i loadu(const uint8_t src[16]) {
- return _mm_loadu_si128((const __m128i *)src);
-}
-
-INLINE void storeu(__m128i src, uint8_t dest[16]) {
- _mm_storeu_si128((__m128i *)dest, src);
-}
-
-INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
-
-// Note that clang-format doesn't like the name "xor" for some reason.
-INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
-
-INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
-
-INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
-}
-
-INLINE __m128i rot16(__m128i x) {
- return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
-}
-
-INLINE __m128i rot12(__m128i x) {
- return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
-}
-
-INLINE __m128i rot8(__m128i x) {
- return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
-}
-
-INLINE __m128i rot7(__m128i x) {
- return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
-}
-
-INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
- __m128i m) {
- *row0 = addv(addv(*row0, m), *row1);
- *row3 = xorv(*row3, *row0);
- *row3 = rot16(*row3);
- *row2 = addv(*row2, *row3);
- *row1 = xorv(*row1, *row2);
- *row1 = rot12(*row1);
-}
-
-INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
- __m128i m) {
- *row0 = addv(addv(*row0, m), *row1);
- *row3 = xorv(*row3, *row0);
- *row3 = rot8(*row3);
- *row2 = addv(*row2, *row3);
- *row1 = xorv(*row1, *row2);
- *row1 = rot7(*row1);
-}
-
-// Note the optimization here of leaving row1 as the unrotated row, rather than
-// row0. All the message loads below are adjusted to compensate for this. See
-// discussion at https://github.com/sneves/blake2-avx2/pull/4
-INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
-}
-
-INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
-}
-
-INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) {
- const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
- __m128i mask = _mm_set1_epi16(imm8);
- mask = _mm_and_si128(mask, bits);
- mask = _mm_cmpeq_epi16(mask, bits);
- return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
-}
-
-INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter, uint8_t flags) {
- rows[0] = loadu((uint8_t *)&cv[0]);
- rows[1] = loadu((uint8_t *)&cv[4]);
- rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
- rows[3] = set4(counter_low(counter), counter_high(counter),
- (uint32_t)block_len, (uint32_t)flags);
-
- __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
- __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
- __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
- __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
-
- __m128i t0, t1, t2, t3, tt;
-
- // Round 1. The first round permutes the message words from the original
- // input order, into the groups that get mixed in parallel.
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
- t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
- t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 2. This round and all following rounds apply a fixed permutation
- // to the message words from the round before.
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 3
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 4
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 5
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 6
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 7
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
-}
-
-void blake3_compress_in_place_sse2(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags) {
- __m128i rows[4];
- compress_pre(rows, cv, block, block_len, counter, flags);
- storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
- storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
-}
-
-void blake3_compress_xof_sse2(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]) {
- __m128i rows[4];
- compress_pre(rows, cv, block, block_len, counter, flags);
- storeu(xorv(rows[0], rows[2]), &out[0]);
- storeu(xorv(rows[1], rows[3]), &out[16]);
- storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
- storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
-}
-
-INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
- v[0] = addv(v[0], v[4]);
- v[1] = addv(v[1], v[5]);
- v[2] = addv(v[2], v[6]);
- v[3] = addv(v[3], v[7]);
- v[12] = xorv(v[12], v[0]);
- v[13] = xorv(v[13], v[1]);
- v[14] = xorv(v[14], v[2]);
- v[15] = xorv(v[15], v[3]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[15] = rot16(v[15]);
- v[8] = addv(v[8], v[12]);
- v[9] = addv(v[9], v[13]);
- v[10] = addv(v[10], v[14]);
- v[11] = addv(v[11], v[15]);
- v[4] = xorv(v[4], v[8]);
- v[5] = xorv(v[5], v[9]);
- v[6] = xorv(v[6], v[10]);
- v[7] = xorv(v[7], v[11]);
- v[4] = rot12(v[4]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
- v[0] = addv(v[0], v[4]);
- v[1] = addv(v[1], v[5]);
- v[2] = addv(v[2], v[6]);
- v[3] = addv(v[3], v[7]);
- v[12] = xorv(v[12], v[0]);
- v[13] = xorv(v[13], v[1]);
- v[14] = xorv(v[14], v[2]);
- v[15] = xorv(v[15], v[3]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[15] = rot8(v[15]);
- v[8] = addv(v[8], v[12]);
- v[9] = addv(v[9], v[13]);
- v[10] = addv(v[10], v[14]);
- v[11] = addv(v[11], v[15]);
- v[4] = xorv(v[4], v[8]);
- v[5] = xorv(v[5], v[9]);
- v[6] = xorv(v[6], v[10]);
- v[7] = xorv(v[7], v[11]);
- v[4] = rot7(v[4]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
-
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
- v[0] = addv(v[0], v[5]);
- v[1] = addv(v[1], v[6]);
- v[2] = addv(v[2], v[7]);
- v[3] = addv(v[3], v[4]);
- v[15] = xorv(v[15], v[0]);
- v[12] = xorv(v[12], v[1]);
- v[13] = xorv(v[13], v[2]);
- v[14] = xorv(v[14], v[3]);
- v[15] = rot16(v[15]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[10] = addv(v[10], v[15]);
- v[11] = addv(v[11], v[12]);
- v[8] = addv(v[8], v[13]);
- v[9] = addv(v[9], v[14]);
- v[5] = xorv(v[5], v[10]);
- v[6] = xorv(v[6], v[11]);
- v[7] = xorv(v[7], v[8]);
- v[4] = xorv(v[4], v[9]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[4] = rot12(v[4]);
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
- v[0] = addv(v[0], v[5]);
- v[1] = addv(v[1], v[6]);
- v[2] = addv(v[2], v[7]);
- v[3] = addv(v[3], v[4]);
- v[15] = xorv(v[15], v[0]);
- v[12] = xorv(v[12], v[1]);
- v[13] = xorv(v[13], v[2]);
- v[14] = xorv(v[14], v[3]);
- v[15] = rot8(v[15]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[10] = addv(v[10], v[15]);
- v[11] = addv(v[11], v[12]);
- v[8] = addv(v[8], v[13]);
- v[9] = addv(v[9], v[14]);
- v[5] = xorv(v[5], v[10]);
- v[6] = xorv(v[6], v[11]);
- v[7] = xorv(v[7], v[8]);
- v[4] = xorv(v[4], v[9]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
- v[4] = rot7(v[4]);
-}
-
-INLINE void transpose_vecs(__m128i vecs[DEGREE]) { // In-place 4x4 transpose of the 32-bit words held in vecs[0..3].
- // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
- // 22/33. Note that this doesn't split the vector into two lanes, as the
- // AVX2 counterparts do.
- __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
- __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
- __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
- __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-
- // Interleave 64-bit lanes, so abcd_N holds word N of each of the four input vectors.
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
- __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
- __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
- __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
-
- vecs[0] = abcd_0;
- vecs[1] = abcd_1;
- vecs[2] = abcd_2;
- vecs[3] = abcd_3;
-}
-
-INLINE void transpose_msg_vecs(const uint8_t *const *inputs, // Load 4 x 16 bytes at block_offset from each of inputs[0..3] and transpose them into out[0..15].
- size_t block_offset, __m128i out[16]) {
- out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); // out[4*k + i] = k-th 16-byte piece of input i
- out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
- out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
- out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
- out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
- out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
- out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
- out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
- out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
- out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
- out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
- out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
- out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
- out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
- out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
- out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
- for (size_t i = 0; i < 4; ++i) {
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); // prefetch hint for each input, 256 bytes past block_offset
- }
- transpose_vecs(&out[0]); // each group of four vectors is transposed independently,
- transpose_vecs(&out[4]); // so afterwards out[w] holds 32-bit word w of this block
- transpose_vecs(&out[8]); // with one lane per input
- transpose_vecs(&out[12]);
-}
-
-INLINE void load_counters(uint64_t counter, bool increment_counter, // Build per-lane 64-bit counters, returned as separate low and high 32-bit word vectors.
- __m128i *out_lo, __m128i *out_hi) {
- const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); // all ones when incrementing, zero otherwise
- const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); // lane i holds i (arguments are listed from the highest lane down)
- const __m128i add1 = _mm_and_si128(mask, add0); // per-lane offset: i when incrementing, else 0
- __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); // low 32 bits of counter plus the lane offset (wraps on overflow)
- __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), // flipping the sign bit of both operands makes the signed compare behave as unsigned add1 > l,
- _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); // which holds (lane = all ones, i.e. -1) exactly where the 32-bit add wrapped
- __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); // subtracting -1 adds the carry into the high word
- *out_lo = l;
- *out_hi = h;
-}
-
-void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, // Hash inputs[0..3] in parallel, one input per 32-bit lane; writes 8 x 16 bytes to out.
- const uint32_t key[8], uint64_t counter,
- bool increment_counter, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
- __m128i h_vecs[8] = { // chaining value: every lane starts from the same key words
- set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
- set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
- };
- __m128i counter_low_vec, counter_high_vec;
- load_counters(counter, increment_counter, &counter_low_vec,
- &counter_high_vec);
- uint8_t block_flags = flags | flags_start; // flags_start is applied to the first block only (cleared at the end of the loop body)
-
- for (size_t block = 0; block < blocks; block++) {
- if (block + 1 == blocks) { // the last block additionally gets flags_end
- block_flags |= flags_end;
- }
- __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
- __m128i block_flags_vec = set1(block_flags);
- __m128i msg_vecs[16];
- transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
-
- __m128i v[16] = { // 16-word state: chaining value, IV[0..3], counter low/high, block length, flags
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
- set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
- };
- round_fn(v, msg_vecs, 0); // seven rounds, r = 0..6 selects the MSG_SCHEDULE row
- round_fn(v, msg_vecs, 1);
- round_fn(v, msg_vecs, 2);
- round_fn(v, msg_vecs, 3);
- round_fn(v, msg_vecs, 4);
- round_fn(v, msg_vecs, 5);
- round_fn(v, msg_vecs, 6);
- h_vecs[0] = xorv(v[0], v[8]); // next chaining value: xor of the two halves of the state
- h_vecs[1] = xorv(v[1], v[9]);
- h_vecs[2] = xorv(v[2], v[10]);
- h_vecs[3] = xorv(v[3], v[11]);
- h_vecs[4] = xorv(v[4], v[12]);
- h_vecs[5] = xorv(v[5], v[13]);
- h_vecs[6] = xorv(v[6], v[14]);
- h_vecs[7] = xorv(v[7], v[15]);
-
- block_flags = flags; // drop flags_start for all blocks after the first
- }
-
- transpose_vecs(&h_vecs[0]); // switch from one word per vector back to one input per vector
- transpose_vecs(&h_vecs[4]);
- // The first four vecs now contain the first half of each output, and the
- // second four vecs contain the second half of each output.
- storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); // stores alternate between the two groups so both halves of each output land next to each other
- storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
- storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
- storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
- storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
- storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
- storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
- storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
-}
-
-INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, // Hash a single input of `blocks` consecutive BLAKE3_BLOCK_LEN-byte blocks into out.
- const uint32_t key[8], uint64_t counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
- uint32_t cv[8];
- memcpy(cv, key, BLAKE3_KEY_LEN); // chaining value starts as a copy of the key
- uint8_t block_flags = flags | flags_start; // flags_start is applied to the first block only
- while (blocks > 0) {
- if (blocks == 1) { // last remaining block additionally gets flags_end
- block_flags |= flags_end;
- }
- blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, // the same counter value is passed for every block
- block_flags);
- input = &input[BLAKE3_BLOCK_LEN];
- blocks -= 1;
- block_flags = flags; // drop flags_start after the first block
- }
- memcpy(out, cv, BLAKE3_OUT_LEN); // the final chaining value is the output
-}
-
-void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, // Hash num_inputs inputs of `blocks` blocks each, writing BLAKE3_OUT_LEN bytes per input to out.
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out) {
- while (num_inputs >= DEGREE) { // full groups of DEGREE inputs go through the parallel routine
- blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
- flags_start, flags_end, out);
- if (increment_counter) { // when enabled, the counter advances by one per input consumed
- counter += DEGREE;
- }
- inputs += DEGREE;
- num_inputs -= DEGREE;
- out = &out[DEGREE * BLAKE3_OUT_LEN];
- }
- while (num_inputs > 0) { // leftover inputs (fewer than DEGREE) are hashed one at a time
- hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
- flags_end, out);
- if (increment_counter) {
- counter += 1;
- }
- inputs += 1;
- num_inputs -= 1;
- out = &out[BLAKE3_OUT_LEN];
- }
-}
diff --git a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S
deleted file mode 100644
index d144046ab..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S
+++ /dev/null
@@ -1,2291 +0,0 @@
-#if defined(__ELF__) && defined(__linux__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
-#if __has_include(<cet.h>)
-#include <cet.h>
-#endif
-#endif
-
-#if !defined(_CET_ENDBR)
-#define _CET_ENDBR
-#endif
-
-.intel_syntax noprefix
-.global blake3_hash_many_sse2
-.global _blake3_hash_many_sse2
-.global blake3_compress_in_place_sse2
-.global _blake3_compress_in_place_sse2
-.global blake3_compress_xof_sse2
-.global _blake3_compress_xof_sse2
-#ifdef __APPLE__
-.text
-#else
-.section .text
-#endif
- .p2align 6
-_blake3_hash_many_sse2:
-blake3_hash_many_sse2:
- _CET_ENDBR
- push r15
- push r14
- push r13
- push r12
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 360
- and rsp, 0xFFFFFFFFFFFFFFC0
- neg r9d
- movd xmm0, r9d
- pshufd xmm0, xmm0, 0x00
- movdqa xmmword ptr [rsp+0x130], xmm0
- movdqa xmm1, xmm0
- pand xmm1, xmmword ptr [ADD0+rip]
- pand xmm0, xmmword ptr [ADD1+rip]
- movdqa xmmword ptr [rsp+0x150], xmm0
- movd xmm0, r8d
- pshufd xmm0, xmm0, 0x00
- paddd xmm0, xmm1
- movdqa xmmword ptr [rsp+0x110], xmm0
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm1, xmm0
- shr r8, 32
- movd xmm2, r8d
- pshufd xmm2, xmm2, 0x00
- psubd xmm2, xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
- mov rbx, qword ptr [rbp+0x50]
- mov r15, rdx
- shl r15, 6
- movzx r13d, byte ptr [rbp+0x38]
- movzx r12d, byte ptr [rbp+0x48]
- cmp rsi, 4
- jc 3f
-2:
- movdqu xmm3, xmmword ptr [rcx]
- pshufd xmm0, xmm3, 0x00
- pshufd xmm1, xmm3, 0x55
- pshufd xmm2, xmm3, 0xAA
- pshufd xmm3, xmm3, 0xFF
- movdqu xmm7, xmmword ptr [rcx+0x10]
- pshufd xmm4, xmm7, 0x00
- pshufd xmm5, xmm7, 0x55
- pshufd xmm6, xmm7, 0xAA
- pshufd xmm7, xmm7, 0xFF
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-9:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movdqu xmm8, xmmword ptr [r8+rdx-0x40]
- movdqu xmm9, xmmword ptr [r9+rdx-0x40]
- movdqu xmm10, xmmword ptr [r10+rdx-0x40]
- movdqu xmm11, xmmword ptr [r11+rdx-0x40]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp], xmm8
- movdqa xmmword ptr [rsp+0x10], xmm9
- movdqa xmmword ptr [rsp+0x20], xmm12
- movdqa xmmword ptr [rsp+0x30], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x30]
- movdqu xmm9, xmmword ptr [r9+rdx-0x30]
- movdqu xmm10, xmmword ptr [r10+rdx-0x30]
- movdqu xmm11, xmmword ptr [r11+rdx-0x30]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x40], xmm8
- movdqa xmmword ptr [rsp+0x50], xmm9
- movdqa xmmword ptr [rsp+0x60], xmm12
- movdqa xmmword ptr [rsp+0x70], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x20]
- movdqu xmm9, xmmword ptr [r9+rdx-0x20]
- movdqu xmm10, xmmword ptr [r10+rdx-0x20]
- movdqu xmm11, xmmword ptr [r11+rdx-0x20]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x80], xmm8
- movdqa xmmword ptr [rsp+0x90], xmm9
- movdqa xmmword ptr [rsp+0xA0], xmm12
- movdqa xmmword ptr [rsp+0xB0], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x10]
- movdqu xmm9, xmmword ptr [r9+rdx-0x10]
- movdqu xmm10, xmmword ptr [r10+rdx-0x10]
- movdqu xmm11, xmmword ptr [r11+rdx-0x10]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0xC0], xmm8
- movdqa xmmword ptr [rsp+0xD0], xmm9
- movdqa xmmword ptr [rsp+0xE0], xmm12
- movdqa xmmword ptr [rsp+0xF0], xmm13
- movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
- movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
- movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
- movdqa xmm12, xmmword ptr [rsp+0x110]
- movdqa xmm13, xmmword ptr [rsp+0x120]
- movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
- movd xmm15, eax
- pshufd xmm15, xmm15, 0x00
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x80]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x70]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xB0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x50]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xC0]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xA0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0x60]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xF0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- pxor xmm0, xmm8
- pxor xmm1, xmm9
- pxor xmm2, xmm10
- pxor xmm3, xmm11
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- pxor xmm4, xmm12
- pxor xmm5, xmm13
- pxor xmm6, xmm14
- pxor xmm7, xmm15
- mov eax, r13d
- jne 9b
- movdqa xmm9, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm9, xmm1
- movdqa xmm11, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm11, xmm3
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2
- punpckhqdq xmm1, xmm2
- movdqa xmm3, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm3, xmm11
- movdqu xmmword ptr [rbx], xmm0
- movdqu xmmword ptr [rbx+0x20], xmm1
- movdqu xmmword ptr [rbx+0x40], xmm9
- movdqu xmmword ptr [rbx+0x60], xmm3
- movdqa xmm9, xmm4
- punpckldq xmm4, xmm5
- punpckhdq xmm9, xmm5
- movdqa xmm11, xmm6
- punpckldq xmm6, xmm7
- punpckhdq xmm11, xmm7
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm5, xmm6
- movdqa xmm7, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm7, xmm11
- movdqu xmmword ptr [rbx+0x10], xmm4
- movdqu xmmword ptr [rbx+0x30], xmm5
- movdqu xmmword ptr [rbx+0x50], xmm9
- movdqu xmmword ptr [rbx+0x70], xmm7
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm0, xmm1
- paddd xmm1, xmmword ptr [rsp+0x150]
- movdqa xmmword ptr [rsp+0x110], xmm1
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm0, xmm1
- movdqa xmm1, xmmword ptr [rsp+0x120]
- psubd xmm1, xmm0
- movdqa xmmword ptr [rsp+0x120], xmm1
- add rbx, 128
- add rdi, 32
- sub rsi, 4
- cmp rsi, 4
- jnc 2b
- test rsi, rsi
- jnz 3f
-4:
- mov rsp, rbp
- pop rbp
- pop rbx
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 5
-3:
- test esi, 0x2
- je 3f
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm8, xmm0
- movaps xmm9, xmm1
- movd xmm13, dword ptr [rsp+0x110]
- movd xmm14, dword ptr [rsp+0x120]
- punpckldq xmm13, xmm14
- movaps xmmword ptr [rsp], xmm13
- movd xmm14, dword ptr [rsp+0x114]
- movd xmm13, dword ptr [rsp+0x124]
- punpckldq xmm14, xmm13
- movaps xmmword ptr [rsp+0x10], xmm14
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm10, xmm2
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm3, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm3, xmm5, 221
- movaps xmm5, xmm3
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm3, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm3, xmm7, 221
- pshufd xmm7, xmm3, 0x93
- movups xmm12, xmmword ptr [r9+rdx-0x40]
- movups xmm13, xmmword ptr [r9+rdx-0x30]
- movaps xmm11, xmm12
- shufps xmm12, xmm13, 136
- shufps xmm11, xmm13, 221
- movaps xmm13, xmm11
- movups xmm14, xmmword ptr [r9+rdx-0x20]
- movups xmm15, xmmword ptr [r9+rdx-0x10]
- movaps xmm11, xmm14
- shufps xmm14, xmm15, 136
- pshufd xmm14, xmm14, 0x93
- shufps xmm11, xmm15, 221
- pshufd xmm15, xmm11, 0x93
- shl rax, 0x20
- or rax, 0x40
- movd xmm3, rax
- movdqa xmmword ptr [rsp+0x20], xmm3
- movaps xmm3, xmmword ptr [rsp]
- movaps xmm11, xmmword ptr [rsp+0x10]
- punpcklqdq xmm3, xmmword ptr [rsp+0x20]
- punpcklqdq xmm11, xmmword ptr [rsp+0x20]
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm8, xmm12
- movaps xmmword ptr [rsp+0x20], xmm4
- movaps xmmword ptr [rsp+0x30], xmm12
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- pshuflw xmm11, xmm11, 0xB1
- pshufhw xmm11, xmm11, 0xB1
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm5
- paddd xmm8, xmm13
- movaps xmmword ptr [rsp+0x40], xmm5
- movaps xmmword ptr [rsp+0x50], xmm13
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movdqa xmm13, xmm3
- psrld xmm3, 8
- pslld xmm13, 24
- pxor xmm3, xmm13
- movdqa xmm13, xmm11
- psrld xmm11, 8
- pslld xmm13, 24
- pxor xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x93
- pshufd xmm8, xmm8, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x39
- pshufd xmm10, xmm10, 0x39
- paddd xmm0, xmm6
- paddd xmm8, xmm14
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- pshuflw xmm11, xmm11, 0xB1
- pshufhw xmm11, xmm11, 0xB1
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm7
- paddd xmm8, xmm15
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movdqa xmm13, xmm3
- psrld xmm3, 8
- pslld xmm13, 24
- pxor xmm3, xmm13
- movdqa xmm13, xmm11
- psrld xmm11, 8
- pslld xmm13, 24
- pxor xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x39
- pshufd xmm8, xmm8, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x93
- pshufd xmm10, xmm10, 0x93
- dec al
- je 9f
- movdqa xmm12, xmmword ptr [rsp+0x20]
- movdqa xmm5, xmmword ptr [rsp+0x40]
- pshufd xmm13, xmm12, 0x0F
- shufps xmm12, xmm5, 214
- pshufd xmm4, xmm12, 0x39
- movdqa xmm12, xmm6
- shufps xmm12, xmm7, 250
- pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm13, xmm12
- movdqa xmmword ptr [rsp+0x20], xmm13
- movdqa xmm12, xmm7
- punpcklqdq xmm12, xmm5
- movdqa xmm13, xmm6
- pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm12, xmm13
- pshufd xmm12, xmm12, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmmword ptr [rsp+0x40], xmm12
- movdqa xmm5, xmmword ptr [rsp+0x30]
- movdqa xmm13, xmmword ptr [rsp+0x50]
- pshufd xmm6, xmm5, 0x0F
- shufps xmm5, xmm13, 214
- pshufd xmm12, xmm5, 0x39
- movdqa xmm5, xmm14
- shufps xmm5, xmm15, 250
- pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm6, xmm5
- movdqa xmm5, xmm15
- punpcklqdq xmm5, xmm13
- movdqa xmmword ptr [rsp+0x30], xmm2
- movdqa xmm2, xmm14
- pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm5, xmm2
- movdqa xmm2, xmmword ptr [rsp+0x30]
- pshufd xmm5, xmm5, 0x78
- punpckhdq xmm13, xmm15
- punpckldq xmm14, xmm13
- pshufd xmm15, xmm14, 0x1E
- movdqa xmm13, xmm6
- movdqa xmm14, xmm5
- movdqa xmm5, xmmword ptr [rsp+0x20]
- movdqa xmm6, xmmword ptr [rsp+0x40]
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm8, xmm10
- pxor xmm9, xmm11
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- movups xmmword ptr [rbx+0x20], xmm8
- movups xmmword ptr [rbx+0x30], xmm9
- mov eax, dword ptr [rsp+0x130]
- neg eax
- mov r10d, dword ptr [rsp+0x110+8*rax]
- mov r11d, dword ptr [rsp+0x120+8*rax]
- mov dword ptr [rsp+0x110], r10d
- mov dword ptr [rsp+0x120], r11d
- add rdi, 16
- add rbx, 64
- sub rsi, 2
-3:
- test esi, 0x1
- je 4b
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movd xmm13, dword ptr [rsp+0x110]
- movd xmm14, dword ptr [rsp+0x120]
- punpckldq xmm13, xmm14
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- shl rax, 32
- or rax, 64
- movd xmm12, rax
- movdqa xmm3, xmm13
- punpcklqdq xmm3, xmm12
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm9, xmm8
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-
-.p2align 6
-blake3_compress_in_place_sse2: /* Compress the 64-byte block at [rsi] into the 32-byte state at [rdi], overwriting that state. */
-_blake3_compress_in_place_sse2: /* Underscore-prefixed alias of the same entry point. */
- _CET_ENDBR /* Macro defined outside this excerpt (presumably a CET landing pad - confirm). */
- movups xmm0, xmmword ptr [rdi] /* xmm0 = state words 0-3 (unaligned load). */
- movups xmm1, xmmword ptr [rdi+0x10] /* xmm1 = state words 4-7. */
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = the four BLAKE3_IV words. */
- movzx eax, r8b /* Use only the low byte of r8, exactly as blake3_compress_xof_sse2 does; stale upper register bits must not reach the state. */
- movzx edx, dl /* Likewise use only the low byte of rdx (the write to edx also clears bits 32-63). */
- shl rax, 32 /* Move the r8 byte into the upper dword. */
- add rdx, rax /* rdx = (r8 byte << 32) | rdx byte. Presumably flags and block_len - confirm against the C prototype. */
- movq xmm3, rcx /* Low qword of the fourth row = rcx (presumably the 64-bit counter - confirm). */
- movq xmm4, rdx
- punpcklqdq xmm3, xmm4 /* xmm3 = { rcx, rdx } as four dwords. */
- movups xmm4, xmmword ptr [rsi] /* Block dwords 0-3. */
- movups xmm5, xmmword ptr [rsi+0x10] /* Block dwords 4-7. */
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136 /* 0x88: xmm4 = block dwords 0,2,4,6. */
- shufps xmm8, xmm5, 221 /* 0xDD: xmm8 = block dwords 1,3,5,7. */
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rsi+0x20] /* Block dwords 8-11. */
- movups xmm7, xmmword ptr [rsi+0x30] /* Block dwords 12-15. */
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136 /* Block dwords 8,10,12,14 ... */
- pshufd xmm6, xmm6, 0x93 /* ... rotated by one lane: 14,8,10,12. */
- shufps xmm8, xmm7, 221 /* Block dwords 9,11,13,15 ... */
- pshufd xmm7, xmm8, 0x93 /* ... rotated by one lane: 15,9,11,13. */
- mov al, 7 /* Round counter: the loop at 9: below runs seven times. */
-9: /* Top of one round; xmm4-xmm7 hold this round's message vectors. */
- paddd xmm0, xmm4 /* First half-round: row0 += message vector xmm4 ... */
- paddd xmm0, xmm1 /* ... += row1. */
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1 /* Swap the 16-bit halves of each dword in the low qword ... */
- pshufhw xmm3, xmm3, 0xB1 /* ... and in the high qword: a 16-bit rotate of every dword. */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1 /* Scratch copy; SSE2 has no rotate, so shift both ways and merge. */
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11 /* Each dword of xmm1 rotated right by 12. */
- paddd xmm0, xmm5 /* row0 += message vector xmm5 ... */
- paddd xmm0, xmm1 /* ... += row1. */
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14 /* Each dword of xmm3 rotated right by 8 (the two shifted halves do not overlap, so pxor merges them). */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11 /* Each dword of xmm1 rotated right by 7. */
- pshufd xmm0, xmm0, 0x93 /* Rotate the lanes of xmm0, xmm3 and xmm2 (xmm1 stays put) so the */
- pshufd xmm3, xmm3, 0x4E /* second half-round pairs different lanes of the four rows. */
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6 /* Second half-round: same sequence with message vectors xmm6 and xmm7. */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39 /* Undo the lane rotation applied before the second half-round. */
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al /* One round done. */
- jz 9f /* After the seventh round skip the message shuffle and finish. */
- movdqa xmm8, xmm4 /* From here to the jmp: rearrange xmm4-xmm7 into the next round's message vectors. */
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39 /* New xmm4. */
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* Keep dwords 0 and 2 of xmm9 ... */
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] /* ... and dwords 1 and 3 of xmm8 ... */
- por xmm9, xmm8 /* ... and merge them: a blend done with and/or. */
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* Keep dwords 0-2 of xmm8 ... */
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] /* ... and dword 3 of xmm10. */
- por xmm8, xmm10
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E /* New xmm7. */
- movdqa xmm5, xmm9 /* New xmm5. */
- movdqa xmm6, xmm8 /* New xmm6. */
- jmp 9b /* Next round. */
-9: /* All seven rounds done. */
- pxor xmm0, xmm2 /* Output words 0-3 = row0 ^ row2. */
- pxor xmm1, xmm3 /* Output words 4-7 = row1 ^ row3. */
- movups xmmword ptr [rdi], xmm0 /* Overwrite the caller's 32-byte state with the result. */
- movups xmmword ptr [rdi+0x10], xmm1
- ret
-
-.p2align 6
-blake3_compress_xof_sse2: /* Compress the 64-byte block at [rsi] with the 32-byte state at [rdi] and write a 64-byte result to [r9]; [rdi] is only read. */
-_blake3_compress_xof_sse2: /* Underscore-prefixed alias of the same entry point. */
- _CET_ENDBR /* Macro defined outside this excerpt (presumably a CET landing pad - confirm). */
- movups xmm0, xmmword ptr [rdi] /* xmm0 = state words 0-3 (unaligned load). */
- movups xmm1, xmmword ptr [rdi+0x10] /* xmm1 = state words 4-7. */
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = the four BLAKE3_IV words. */
- movzx eax, r8b /* Only the low byte of r8 is used. */
- movzx edx, dl /* Only the low byte of rdx is used. */
- shl rax, 32 /* Move the r8 byte into the upper dword. */
- add rdx, rax /* rdx = (r8 byte << 32) | rdx byte. Presumably flags and block_len - confirm against the C prototype. */
- movq xmm3, rcx /* Low qword of the fourth row = rcx (presumably the 64-bit counter - confirm). */
- movq xmm4, rdx
- punpcklqdq xmm3, xmm4 /* xmm3 = { rcx, rdx } as four dwords. */
- movups xmm4, xmmword ptr [rsi] /* Block dwords 0-3. */
- movups xmm5, xmmword ptr [rsi+0x10] /* Block dwords 4-7. */
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136 /* 0x88: xmm4 = block dwords 0,2,4,6. */
- shufps xmm8, xmm5, 221 /* 0xDD: xmm8 = block dwords 1,3,5,7. */
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rsi+0x20] /* Block dwords 8-11. */
- movups xmm7, xmmword ptr [rsi+0x30] /* Block dwords 12-15. */
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136 /* Block dwords 8,10,12,14 ... */
- pshufd xmm6, xmm6, 0x93 /* ... rotated by one lane: 14,8,10,12. */
- shufps xmm8, xmm7, 221 /* Block dwords 9,11,13,15 ... */
- pshufd xmm7, xmm8, 0x93 /* ... rotated by one lane: 15,9,11,13. */
- mov al, 7 /* Round counter: the loop at 9: below runs seven times. */
-9: /* Top of one round; xmm4-xmm7 hold this round's message vectors. */
- paddd xmm0, xmm4 /* First half-round: row0 += message vector xmm4 ... */
- paddd xmm0, xmm1 /* ... += row1. */
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1 /* Swap the 16-bit halves of each dword in the low qword ... */
- pshufhw xmm3, xmm3, 0xB1 /* ... and in the high qword: a 16-bit rotate of every dword. */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1 /* Scratch copy; SSE2 has no rotate, so shift both ways and merge. */
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11 /* Each dword of xmm1 rotated right by 12. */
- paddd xmm0, xmm5 /* row0 += message vector xmm5 ... */
- paddd xmm0, xmm1 /* ... += row1. */
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14 /* Each dword of xmm3 rotated right by 8 (the two shifted halves do not overlap, so pxor merges them). */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11 /* Each dword of xmm1 rotated right by 7. */
- pshufd xmm0, xmm0, 0x93 /* Rotate the lanes of xmm0, xmm3 and xmm2 (xmm1 stays put) so the */
- pshufd xmm3, xmm3, 0x4E /* second half-round pairs different lanes of the four rows. */
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6 /* Second half-round: same sequence with message vectors xmm6 and xmm7. */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39 /* Undo the lane rotation applied before the second half-round. */
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al /* One round done. */
- jz 9f /* After the seventh round skip the message shuffle and finish. */
- movdqa xmm8, xmm4 /* From here to the jmp: rearrange xmm4-xmm7 into the next round's message vectors. */
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39 /* New xmm4. */
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* Keep dwords 0 and 2 of xmm9 ... */
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] /* ... and dwords 1 and 3 of xmm8 ... */
- por xmm9, xmm8 /* ... and merge them: a blend done with and/or. */
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* Keep dwords 0-2 of xmm8 ... */
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] /* ... and dword 3 of xmm10. */
- por xmm8, xmm10
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E /* New xmm7. */
- movdqa xmm5, xmm9 /* New xmm5. */
- movdqa xmm6, xmm8 /* New xmm6. */
- jmp 9b /* Next round. */
-9: /* All seven rounds done. */
- movdqu xmm4, xmmword ptr [rdi] /* Reload the caller's original state words 0-3 ... */
- movdqu xmm5, xmmword ptr [rdi+0x10] /* ... and 4-7; the message registers are free to reuse now. */
- pxor xmm0, xmm2 /* Output bytes 0-15 = row0 ^ row2. */
- pxor xmm1, xmm3 /* Output bytes 16-31 = row1 ^ row3. */
- pxor xmm2, xmm4 /* Output bytes 32-47 = row2 ^ original state words 0-3. */
- pxor xmm3, xmm5 /* Output bytes 48-63 = row3 ^ original state words 4-7. */
- movups xmmword ptr [r9], xmm0 /* Store all 64 output bytes at [r9] (unaligned stores). */
- movups xmmword ptr [r9+0x10], xmm1
- movups xmmword ptr [r9+0x20], xmm2
- movups xmmword ptr [r9+0x30], xmm3
- ret
-
-
-#ifdef __APPLE__ /* Read-only constant table shared by the routines in this file; Apple builds use a different section directive. */
-.static_data
-#else
-.section .rodata
-#endif
-.p2align 6 /* 64-byte alignment; every 16-byte entry below is therefore aligned, as the movaps/movdqa/pand memory operands that read them require. */
-BLAKE3_IV: /* Four IV dwords, loaded as a single vector into the third state row by the one- and two-input paths. */
- .long 0x6A09E667, 0xBB67AE85
- .long 0x3C6EF372, 0xA54FF53A
-ADD0: /* Lane indices 0..3. NOTE(review): the Windows variant later in this diff masks these and adds them to the broadcast low counter word; the use in this file lies outside this excerpt - confirm. */
- .long 0, 1, 2, 3
-ADD1: /* 4 in every lane. NOTE(review): presumably the per-iteration counter step of the four-input loop - confirm. */
- .long 4, 4, 4, 4
-BLAKE3_IV_0: /* BLAKE3_IV_0..3: each IV dword repeated in all four lanes. */
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
-BLAKE3_IV_1:
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
-BLAKE3_IV_2:
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
-BLAKE3_IV_3:
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
-BLAKE3_BLOCK_LEN: /* The value 64 in every lane. */
- .long 64, 64, 64, 64
-CMP_MSB_MASK: /* Sign-bit mask: xor-ed into both operands before pcmpgtd so the signed compare orders them as unsigned values. */
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
-PBLENDW_0x33_MASK: /* The PBLENDW_* masks are applied in pand/pand/por triples to merge lanes of two registers. This one keeps dwords 0 and 2. */
- .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
-PBLENDW_0xCC_MASK: /* Complement of the 0x33 mask: keeps dwords 1 and 3. */
- .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
-PBLENDW_0x3F_MASK: /* Keeps dwords 0, 1 and 2. */
- .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
-PBLENDW_0xC0_MASK: /* Complement of the 0x3F mask: keeps dword 3 only. */
- .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S
deleted file mode 100644
index 494c0c6fd..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S
+++ /dev/null
@@ -1,2332 +0,0 @@
-.intel_syntax noprefix
-.global blake3_hash_many_sse2
-.global _blake3_hash_many_sse2
-.global blake3_compress_in_place_sse2
-.global _blake3_compress_in_place_sse2
-.global blake3_compress_xof_sse2
-.global _blake3_compress_xof_sse2
-.section .text
- .p2align 6
-_blake3_hash_many_sse2:
-blake3_hash_many_sse2:
- push r15
- push r14
- push r13
- push r12
- push rsi
- push rdi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 528
- and rsp, 0xFFFFFFFFFFFFFFC0
- movdqa xmmword ptr [rsp+0x170], xmm6
- movdqa xmmword ptr [rsp+0x180], xmm7
- movdqa xmmword ptr [rsp+0x190], xmm8
- movdqa xmmword ptr [rsp+0x1A0], xmm9
- movdqa xmmword ptr [rsp+0x1B0], xmm10
- movdqa xmmword ptr [rsp+0x1C0], xmm11
- movdqa xmmword ptr [rsp+0x1D0], xmm12
- movdqa xmmword ptr [rsp+0x1E0], xmm13
- movdqa xmmword ptr [rsp+0x1F0], xmm14
- movdqa xmmword ptr [rsp+0x200], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+0x68]
- movzx r9, byte ptr [rbp+0x70]
- neg r9d
- movd xmm0, r9d
- pshufd xmm0, xmm0, 0x00
- movdqa xmmword ptr [rsp+0x130], xmm0
- movdqa xmm1, xmm0
- pand xmm1, xmmword ptr [ADD0+rip]
- pand xmm0, xmmword ptr [ADD1+rip]
- movdqa xmmword ptr [rsp+0x150], xmm0
- movd xmm0, r8d
- pshufd xmm0, xmm0, 0x00
- paddd xmm0, xmm1
- movdqa xmmword ptr [rsp+0x110], xmm0
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm1, xmm0
- shr r8, 32
- movd xmm2, r8d
- pshufd xmm2, xmm2, 0x00
- psubd xmm2, xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
- mov rbx, qword ptr [rbp+0x90]
- mov r15, rdx
- shl r15, 6
- movzx r13d, byte ptr [rbp+0x78]
- movzx r12d, byte ptr [rbp+0x88]
- cmp rsi, 4
- jc 3f
-2:
- movdqu xmm3, xmmword ptr [rcx]
- pshufd xmm0, xmm3, 0x00
- pshufd xmm1, xmm3, 0x55
- pshufd xmm2, xmm3, 0xAA
- pshufd xmm3, xmm3, 0xFF
- movdqu xmm7, xmmword ptr [rcx+0x10]
- pshufd xmm4, xmm7, 0x00
- pshufd xmm5, xmm7, 0x55
- pshufd xmm6, xmm7, 0xAA
- pshufd xmm7, xmm7, 0xFF
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-9:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movdqu xmm8, xmmword ptr [r8+rdx-0x40]
- movdqu xmm9, xmmword ptr [r9+rdx-0x40]
- movdqu xmm10, xmmword ptr [r10+rdx-0x40]
- movdqu xmm11, xmmword ptr [r11+rdx-0x40]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp], xmm8
- movdqa xmmword ptr [rsp+0x10], xmm9
- movdqa xmmword ptr [rsp+0x20], xmm12
- movdqa xmmword ptr [rsp+0x30], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x30]
- movdqu xmm9, xmmword ptr [r9+rdx-0x30]
- movdqu xmm10, xmmword ptr [r10+rdx-0x30]
- movdqu xmm11, xmmword ptr [r11+rdx-0x30]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x40], xmm8
- movdqa xmmword ptr [rsp+0x50], xmm9
- movdqa xmmword ptr [rsp+0x60], xmm12
- movdqa xmmword ptr [rsp+0x70], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x20]
- movdqu xmm9, xmmword ptr [r9+rdx-0x20]
- movdqu xmm10, xmmword ptr [r10+rdx-0x20]
- movdqu xmm11, xmmword ptr [r11+rdx-0x20]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x80], xmm8
- movdqa xmmword ptr [rsp+0x90], xmm9
- movdqa xmmword ptr [rsp+0xA0], xmm12
- movdqa xmmword ptr [rsp+0xB0], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x10]
- movdqu xmm9, xmmword ptr [r9+rdx-0x10]
- movdqu xmm10, xmmword ptr [r10+rdx-0x10]
- movdqu xmm11, xmmword ptr [r11+rdx-0x10]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0xC0], xmm8
- movdqa xmmword ptr [rsp+0xD0], xmm9
- movdqa xmmword ptr [rsp+0xE0], xmm12
- movdqa xmmword ptr [rsp+0xF0], xmm13
- movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
- movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
- movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
- movdqa xmm12, xmmword ptr [rsp+0x110]
- movdqa xmm13, xmmword ptr [rsp+0x120]
- movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
- movd xmm15, eax
- pshufd xmm15, xmm15, 0x00
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x80]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x70]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xB0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x50]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xC0]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xA0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0x60]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xF0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0xB1
- pshufhw xmm15, xmm15, 0xB1
- pshuflw xmm12, xmm12, 0xB1
- pshufhw xmm12, xmm12, 0xB1
- pshuflw xmm13, xmm13, 0xB1
- pshufhw xmm13, xmm13, 0xB1
- pshuflw xmm14, xmm14, 0xB1
- pshufhw xmm14, xmm14, 0xB1
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- pxor xmm0, xmm8
- pxor xmm1, xmm9
- pxor xmm2, xmm10
- pxor xmm3, xmm11
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- pxor xmm4, xmm12
- pxor xmm5, xmm13
- pxor xmm6, xmm14
- pxor xmm7, xmm15
- mov eax, r13d
- jne 9b
- movdqa xmm9, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm9, xmm1
- movdqa xmm11, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm11, xmm3
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2
- punpckhqdq xmm1, xmm2
- movdqa xmm3, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm3, xmm11
- movdqu xmmword ptr [rbx], xmm0
- movdqu xmmword ptr [rbx+0x20], xmm1
- movdqu xmmword ptr [rbx+0x40], xmm9
- movdqu xmmword ptr [rbx+0x60], xmm3
- movdqa xmm9, xmm4
- punpckldq xmm4, xmm5
- punpckhdq xmm9, xmm5
- movdqa xmm11, xmm6
- punpckldq xmm6, xmm7
- punpckhdq xmm11, xmm7
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm5, xmm6
- movdqa xmm7, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm7, xmm11
- movdqu xmmword ptr [rbx+0x10], xmm4
- movdqu xmmword ptr [rbx+0x30], xmm5
- movdqu xmmword ptr [rbx+0x50], xmm9
- movdqu xmmword ptr [rbx+0x70], xmm7
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm0, xmm1
- paddd xmm1, xmmword ptr [rsp+0x150]
- movdqa xmmword ptr [rsp+0x110], xmm1
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm0, xmm1
- movdqa xmm1, xmmword ptr [rsp+0x120]
- psubd xmm1, xmm0
- movdqa xmmword ptr [rsp+0x120], xmm1
- add rbx, 128
- add rdi, 32
- sub rsi, 4
- cmp rsi, 4
- jnc 2b
- test rsi, rsi
- jne 3f
-4:
- movdqa xmm6, xmmword ptr [rsp+0x170]
- movdqa xmm7, xmmword ptr [rsp+0x180]
- movdqa xmm8, xmmword ptr [rsp+0x190]
- movdqa xmm9, xmmword ptr [rsp+0x1A0]
- movdqa xmm10, xmmword ptr [rsp+0x1B0]
- movdqa xmm11, xmmword ptr [rsp+0x1C0]
- movdqa xmm12, xmmword ptr [rsp+0x1D0]
- movdqa xmm13, xmmword ptr [rsp+0x1E0]
- movdqa xmm14, xmmword ptr [rsp+0x1F0]
- movdqa xmm15, xmmword ptr [rsp+0x200]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 5
-3:
- test esi, 0x2
- je 3f
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm8, xmm0
- movaps xmm9, xmm1
- movd xmm13, dword ptr [rsp+0x110]
- movd xmm14, dword ptr [rsp+0x120]
- punpckldq xmm13, xmm14
- movaps xmmword ptr [rsp], xmm13
- movd xmm14, dword ptr [rsp+0x114]
- movd xmm13, dword ptr [rsp+0x124]
- punpckldq xmm14, xmm13
- movaps xmmword ptr [rsp+0x10], xmm14
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm10, xmm2
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm3, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm3, xmm5, 221
- movaps xmm5, xmm3
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm3, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm3, xmm7, 221
- pshufd xmm7, xmm3, 0x93
- movups xmm12, xmmword ptr [r9+rdx-0x40]
- movups xmm13, xmmword ptr [r9+rdx-0x30]
- movaps xmm11, xmm12
- shufps xmm12, xmm13, 136
- shufps xmm11, xmm13, 221
- movaps xmm13, xmm11
- movups xmm14, xmmword ptr [r9+rdx-0x20]
- movups xmm15, xmmword ptr [r9+rdx-0x10]
- movaps xmm11, xmm14
- shufps xmm14, xmm15, 136
- pshufd xmm14, xmm14, 0x93
- shufps xmm11, xmm15, 221
- pshufd xmm15, xmm11, 0x93
- shl rax, 0x20
- or rax, 0x40
- movd xmm3, rax
- movdqa xmmword ptr [rsp+0x20], xmm3
- movaps xmm3, xmmword ptr [rsp]
- movaps xmm11, xmmword ptr [rsp+0x10]
- punpcklqdq xmm3, xmmword ptr [rsp+0x20]
- punpcklqdq xmm11, xmmword ptr [rsp+0x20]
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm8, xmm12
- movaps xmmword ptr [rsp+0x20], xmm4
- movaps xmmword ptr [rsp+0x30], xmm12
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- pshuflw xmm11, xmm11, 0xB1
- pshufhw xmm11, xmm11, 0xB1
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm5
- paddd xmm8, xmm13
- movaps xmmword ptr [rsp+0x40], xmm5
- movaps xmmword ptr [rsp+0x50], xmm13
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movdqa xmm13, xmm3
- psrld xmm3, 8
- pslld xmm13, 24
- pxor xmm3, xmm13
- movdqa xmm13, xmm11
- psrld xmm11, 8
- pslld xmm13, 24
- pxor xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x93
- pshufd xmm8, xmm8, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x39
- pshufd xmm10, xmm10, 0x39
- paddd xmm0, xmm6
- paddd xmm8, xmm14
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- pshuflw xmm11, xmm11, 0xB1
- pshufhw xmm11, xmm11, 0xB1
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm7
- paddd xmm8, xmm15
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movdqa xmm13, xmm3
- psrld xmm3, 8
- pslld xmm13, 24
- pxor xmm3, xmm13
- movdqa xmm13, xmm11
- psrld xmm11, 8
- pslld xmm13, 24
- pxor xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x39
- pshufd xmm8, xmm8, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x93
- pshufd xmm10, xmm10, 0x93
- dec al
- je 9f
- movdqa xmm12, xmmword ptr [rsp+0x20]
- movdqa xmm5, xmmword ptr [rsp+0x40]
- pshufd xmm13, xmm12, 0x0F
- shufps xmm12, xmm5, 214
- pshufd xmm4, xmm12, 0x39
- movdqa xmm12, xmm6
- shufps xmm12, xmm7, 250
- pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm13, xmm12
- movdqa xmmword ptr [rsp+0x20], xmm13
- movdqa xmm12, xmm7
- punpcklqdq xmm12, xmm5
- movdqa xmm13, xmm6
- pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm12, xmm13
- pshufd xmm12, xmm12, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmmword ptr [rsp+0x40], xmm12
- movdqa xmm5, xmmword ptr [rsp+0x30]
- movdqa xmm13, xmmword ptr [rsp+0x50]
- pshufd xmm6, xmm5, 0x0F
- shufps xmm5, xmm13, 214
- pshufd xmm12, xmm5, 0x39
- movdqa xmm5, xmm14
- shufps xmm5, xmm15, 250
- pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm6, xmm5
- movdqa xmm5, xmm15
- punpcklqdq xmm5, xmm13
- movdqa xmmword ptr [rsp+0x30], xmm2
- movdqa xmm2, xmm14
- pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm5, xmm2
- movdqa xmm2, xmmword ptr [rsp+0x30]
- pshufd xmm5, xmm5, 0x78
- punpckhdq xmm13, xmm15
- punpckldq xmm14, xmm13
- pshufd xmm15, xmm14, 0x1E
- movdqa xmm13, xmm6
- movdqa xmm14, xmm5
- movdqa xmm5, xmmword ptr [rsp+0x20]
- movdqa xmm6, xmmword ptr [rsp+0x40]
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm8, xmm10
- pxor xmm9, xmm11
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- movups xmmword ptr [rbx+0x20], xmm8
- movups xmmword ptr [rbx+0x30], xmm9
- mov eax, dword ptr [rsp+0x130]
- neg eax
- mov r10d, dword ptr [rsp+0x110+8*rax]
- mov r11d, dword ptr [rsp+0x120+8*rax]
- mov dword ptr [rsp+0x110], r10d
- mov dword ptr [rsp+0x120], r11d
- add rdi, 16
- add rbx, 64
- sub rsi, 2
-3:
- test esi, 0x1
- je 4b
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movd xmm13, dword ptr [rsp+0x110]
- movd xmm14, dword ptr [rsp+0x120]
- punpckldq xmm13, xmm14
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- shl rax, 32
- or rax, 64
- movd xmm12, rax
- movdqa xmm3, xmm13
- punpcklqdq xmm3, xmm12
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm9, xmm8
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-
-.p2align 6
-blake3_compress_in_place_sse2:
-_blake3_compress_in_place_sse2:
- sub rsp, 120
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+0x10], xmm7
- movdqa xmmword ptr [rsp+0x20], xmm8
- movdqa xmmword ptr [rsp+0x30], xmm9
- movdqa xmmword ptr [rsp+0x40], xmm11
- movdqa xmmword ptr [rsp+0x50], xmm14
- movdqa xmmword ptr [rsp+0x60], xmm15
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movzx eax, byte ptr [rsp+0xA0]
- movzx r8d, r8b
- shl rax, 32
- add r8, rax
- movq xmm3, r9
- movq xmm4, r8
- punpcklqdq xmm3, xmm4
- movups xmm4, xmmword ptr [rdx]
- movups xmm5, xmmword ptr [rdx+0x10]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rdx+0x20]
- movups xmm7, xmmword ptr [rdx+0x30]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm9, xmm8
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- movups xmmword ptr [rcx], xmm0
- movups xmmword ptr [rcx+0x10], xmm1
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+0x10]
- movdqa xmm8, xmmword ptr [rsp+0x20]
- movdqa xmm9, xmmword ptr [rsp+0x30]
- movdqa xmm11, xmmword ptr [rsp+0x40]
- movdqa xmm14, xmmword ptr [rsp+0x50]
- movdqa xmm15, xmmword ptr [rsp+0x60]
- add rsp, 120
- ret
-
-
-.p2align 6
-_blake3_compress_xof_sse2:
-blake3_compress_xof_sse2:
- sub rsp, 120
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+0x10], xmm7
- movdqa xmmword ptr [rsp+0x20], xmm8
- movdqa xmmword ptr [rsp+0x30], xmm9
- movdqa xmmword ptr [rsp+0x40], xmm11
- movdqa xmmword ptr [rsp+0x50], xmm14
- movdqa xmmword ptr [rsp+0x60], xmm15
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movzx eax, byte ptr [rsp+0xA0]
- movzx r8d, r8b
- mov r10, qword ptr [rsp+0xA8]
- shl rax, 32
- add r8, rax
- movq xmm3, r9
- movq xmm4, r8
- punpcklqdq xmm3, xmm4
- movups xmm4, xmmword ptr [rdx]
- movups xmm5, xmmword ptr [rdx+0x10]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rdx+0x20]
- movups xmm7, xmmword ptr [rdx+0x30]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0xB1
- pshufhw xmm3, xmm3, 0xB1
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- por xmm9, xmm8
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- movdqu xmm4, xmmword ptr [rcx]
- movdqu xmm5, xmmword ptr [rcx+0x10]
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm2, xmm4
- pxor xmm3, xmm5
- movups xmmword ptr [r10], xmm0
- movups xmmword ptr [r10+0x10], xmm1
- movups xmmword ptr [r10+0x20], xmm2
- movups xmmword ptr [r10+0x30], xmm3
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+0x10]
- movdqa xmm8, xmmword ptr [rsp+0x20]
- movdqa xmm9, xmmword ptr [rsp+0x30]
- movdqa xmm11, xmmword ptr [rsp+0x40]
- movdqa xmm14, xmmword ptr [rsp+0x50]
- movdqa xmm15, xmmword ptr [rsp+0x60]
- add rsp, 120
- ret
-
-
-.section .rodata
-.p2align 6
-BLAKE3_IV:
- .long 0x6A09E667, 0xBB67AE85
- .long 0x3C6EF372, 0xA54FF53A
-ADD0:
- .long 0, 1, 2, 3
-ADD1:
- .long 4, 4, 4, 4
-BLAKE3_IV_0:
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
-BLAKE3_IV_1:
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
-BLAKE3_IV_2:
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
-BLAKE3_IV_3:
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
-BLAKE3_BLOCK_LEN:
- .long 64, 64, 64, 64
-CMP_MSB_MASK:
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
-PBLENDW_0x33_MASK:
- .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
-PBLENDW_0xCC_MASK:
- .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
-PBLENDW_0x3F_MASK:
- .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
-PBLENDW_0xC0_MASK:
- .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm
deleted file mode 100644
index 72deb7bbc..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm
+++ /dev/null
@@ -1,2350 +0,0 @@
-public _blake3_hash_many_sse2
-public blake3_hash_many_sse2
-public blake3_compress_in_place_sse2
-public _blake3_compress_in_place_sse2
-public blake3_compress_xof_sse2
-public _blake3_compress_xof_sse2
-
-_TEXT SEGMENT ALIGN(16) 'CODE'
-
-ALIGN 16
-blake3_hash_many_sse2 PROC
-_blake3_hash_many_sse2 PROC
- push r15
- push r14
- push r13
- push r12
- push rsi
- push rdi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 528
- and rsp, 0FFFFFFFFFFFFFFC0H
- movdqa xmmword ptr [rsp+170H], xmm6
- movdqa xmmword ptr [rsp+180H], xmm7
- movdqa xmmword ptr [rsp+190H], xmm8
- movdqa xmmword ptr [rsp+1A0H], xmm9
- movdqa xmmword ptr [rsp+1B0H], xmm10
- movdqa xmmword ptr [rsp+1C0H], xmm11
- movdqa xmmword ptr [rsp+1D0H], xmm12
- movdqa xmmword ptr [rsp+1E0H], xmm13
- movdqa xmmword ptr [rsp+1F0H], xmm14
- movdqa xmmword ptr [rsp+200H], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+68H]
- movzx r9, byte ptr [rbp+70H]
- neg r9d
- movd xmm0, r9d
- pshufd xmm0, xmm0, 00H
- movdqa xmmword ptr [rsp+130H], xmm0
- movdqa xmm1, xmm0
- pand xmm1, xmmword ptr [ADD0]
- pand xmm0, xmmword ptr [ADD1]
- movdqa xmmword ptr [rsp+150H], xmm0
- movd xmm0, r8d
- pshufd xmm0, xmm0, 00H
- paddd xmm0, xmm1
- movdqa xmmword ptr [rsp+110H], xmm0
- pxor xmm0, xmmword ptr [CMP_MSB_MASK]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK]
- pcmpgtd xmm1, xmm0
- shr r8, 32
- movd xmm2, r8d
- pshufd xmm2, xmm2, 00H
- psubd xmm2, xmm1
- movdqa xmmword ptr [rsp+120H], xmm2
- mov rbx, qword ptr [rbp+90H]
- mov r15, rdx
- shl r15, 6
- movzx r13d, byte ptr [rbp+78H]
- movzx r12d, byte ptr [rbp+88H]
- cmp rsi, 4
- jc final3blocks
-outerloop4:
- movdqu xmm3, xmmword ptr [rcx]
- pshufd xmm0, xmm3, 00H
- pshufd xmm1, xmm3, 55H
- pshufd xmm2, xmm3, 0AAH
- pshufd xmm3, xmm3, 0FFH
- movdqu xmm7, xmmword ptr [rcx+10H]
- pshufd xmm4, xmm7, 00H
- pshufd xmm5, xmm7, 55H
- pshufd xmm6, xmm7, 0AAH
- pshufd xmm7, xmm7, 0FFH
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-innerloop4:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movdqu xmm8, xmmword ptr [r8+rdx-40H]
- movdqu xmm9, xmmword ptr [r9+rdx-40H]
- movdqu xmm10, xmmword ptr [r10+rdx-40H]
- movdqu xmm11, xmmword ptr [r11+rdx-40H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp], xmm8
- movdqa xmmword ptr [rsp+10H], xmm9
- movdqa xmmword ptr [rsp+20H], xmm12
- movdqa xmmword ptr [rsp+30H], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-30H]
- movdqu xmm9, xmmword ptr [r9+rdx-30H]
- movdqu xmm10, xmmword ptr [r10+rdx-30H]
- movdqu xmm11, xmmword ptr [r11+rdx-30H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+40H], xmm8
- movdqa xmmword ptr [rsp+50H], xmm9
- movdqa xmmword ptr [rsp+60H], xmm12
- movdqa xmmword ptr [rsp+70H], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-20H]
- movdqu xmm9, xmmword ptr [r9+rdx-20H]
- movdqu xmm10, xmmword ptr [r10+rdx-20H]
- movdqu xmm11, xmmword ptr [r11+rdx-20H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+80H], xmm8
- movdqa xmmword ptr [rsp+90H], xmm9
- movdqa xmmword ptr [rsp+0A0H], xmm12
- movdqa xmmword ptr [rsp+0B0H], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-10H]
- movdqu xmm9, xmmword ptr [r9+rdx-10H]
- movdqu xmm10, xmmword ptr [r10+rdx-10H]
- movdqu xmm11, xmmword ptr [r11+rdx-10H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0C0H], xmm8
- movdqa xmmword ptr [rsp+0D0H], xmm9
- movdqa xmmword ptr [rsp+0E0H], xmm12
- movdqa xmmword ptr [rsp+0F0H], xmm13
- movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
- movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
- movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
- movdqa xmm12, xmmword ptr [rsp+110H]
- movdqa xmm13, xmmword ptr [rsp+120H]
- movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
- movd xmm15, eax
- pshufd xmm15, xmm15, 00H
- prefetcht0 byte ptr [r8+rdx+80H]
- prefetcht0 byte ptr [r9+rdx+80H]
- prefetcht0 byte ptr [r10+rdx+80H]
- prefetcht0 byte ptr [r11+rdx+80H]
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+20H]
- paddd xmm2, xmmword ptr [rsp+40H]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+10H]
- paddd xmm1, xmmword ptr [rsp+30H]
- paddd xmm2, xmmword ptr [rsp+50H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+80H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp+0C0H]
- paddd xmm3, xmmword ptr [rsp+0E0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+90H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+0D0H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+20H]
- paddd xmm1, xmmword ptr [rsp+30H]
- paddd xmm2, xmmword ptr [rsp+70H]
- paddd xmm3, xmmword ptr [rsp+40H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+60H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0D0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+10H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+90H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0B0H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp+0E0H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+30H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp+0D0H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+40H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+20H]
- paddd xmm3, xmmword ptr [rsp+0E0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+60H]
- paddd xmm1, xmmword ptr [rsp+90H]
- paddd xmm2, xmmword ptr [rsp+0B0H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+50H]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0F0H]
- paddd xmm3, xmmword ptr [rsp+10H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0A0H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+0E0H]
- paddd xmm3, xmmword ptr [rsp+0D0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+70H]
- paddd xmm1, xmmword ptr [rsp+90H]
- paddd xmm2, xmmword ptr [rsp+30H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+40H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+50H]
- paddd xmm3, xmmword ptr [rsp+10H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+20H]
- paddd xmm2, xmmword ptr [rsp+80H]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0C0H]
- paddd xmm1, xmmword ptr [rsp+90H]
- paddd xmm2, xmmword ptr [rsp+0F0H]
- paddd xmm3, xmmword ptr [rsp+0E0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0D0H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+0A0H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+70H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+20H]
- paddd xmm1, xmmword ptr [rsp+30H]
- paddd xmm2, xmmword ptr [rsp+10H]
- paddd xmm3, xmmword ptr [rsp+40H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+90H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+80H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0E0H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp+0C0H]
- paddd xmm3, xmmword ptr [rsp+10H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0D0H]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+20H]
- paddd xmm3, xmmword ptr [rsp+40H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+30H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp+60H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0B0H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp+10H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0F0H]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+90H]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0E0H]
- paddd xmm1, xmmword ptr [rsp+20H]
- paddd xmm2, xmmword ptr [rsp+30H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- pshuflw xmm15, xmm15, 0B1H
- pshufhw xmm15, xmm15, 0B1H
- pshuflw xmm12, xmm12, 0B1H
- pshufhw xmm12, xmm12, 0B1H
- pshuflw xmm13, xmm13, 0B1H
- pshufhw xmm13, xmm13, 0B1H
- pshuflw xmm14, xmm14, 0B1H
- pshufhw xmm14, xmm14, 0B1H
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0A0H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+40H]
- paddd xmm3, xmmword ptr [rsp+0D0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmm15
- psrld xmm15, 8
- pslld xmm8, 24
- pxor xmm15, xmm8
- movdqa xmm8, xmm12
- psrld xmm12, 8
- pslld xmm8, 24
- pxor xmm12, xmm8
- movdqa xmm8, xmm13
- psrld xmm13, 8
- pslld xmm8, 24
- pxor xmm13, xmm8
- movdqa xmm8, xmm14
- psrld xmm14, 8
- pslld xmm8, 24
- pxor xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- pxor xmm0, xmm8
- pxor xmm1, xmm9
- pxor xmm2, xmm10
- pxor xmm3, xmm11
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- pxor xmm4, xmm12
- pxor xmm5, xmm13
- pxor xmm6, xmm14
- pxor xmm7, xmm15
- mov eax, r13d
- jne innerloop4
- movdqa xmm9, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm9, xmm1
- movdqa xmm11, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm11, xmm3
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2
- punpckhqdq xmm1, xmm2
- movdqa xmm3, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm3, xmm11
- movdqu xmmword ptr [rbx], xmm0
- movdqu xmmword ptr [rbx+20H], xmm1
- movdqu xmmword ptr [rbx+40H], xmm9
- movdqu xmmword ptr [rbx+60H], xmm3
- movdqa xmm9, xmm4
- punpckldq xmm4, xmm5
- punpckhdq xmm9, xmm5
- movdqa xmm11, xmm6
- punpckldq xmm6, xmm7
- punpckhdq xmm11, xmm7
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm5, xmm6
- movdqa xmm7, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm7, xmm11
- movdqu xmmword ptr [rbx+10H], xmm4
- movdqu xmmword ptr [rbx+30H], xmm5
- movdqu xmmword ptr [rbx+50H], xmm9
- movdqu xmmword ptr [rbx+70H], xmm7
- movdqa xmm1, xmmword ptr [rsp+110H]
- movdqa xmm0, xmm1
- paddd xmm1, xmmword ptr [rsp+150H]
- movdqa xmmword ptr [rsp+110H], xmm1
- pxor xmm0, xmmword ptr [CMP_MSB_MASK]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK]
- pcmpgtd xmm0, xmm1
- movdqa xmm1, xmmword ptr [rsp+120H]
- psubd xmm1, xmm0
- movdqa xmmword ptr [rsp+120H], xmm1
- add rbx, 128
- add rdi, 32
- sub rsi, 4
- cmp rsi, 4
- jnc outerloop4
- test rsi, rsi
- jne final3blocks
-unwind:
- movdqa xmm6, xmmword ptr [rsp+170H]
- movdqa xmm7, xmmword ptr [rsp+180H]
- movdqa xmm8, xmmword ptr [rsp+190H]
- movdqa xmm9, xmmword ptr [rsp+1A0H]
- movdqa xmm10, xmmword ptr [rsp+1B0H]
- movdqa xmm11, xmmword ptr [rsp+1C0H]
- movdqa xmm12, xmmword ptr [rsp+1D0H]
- movdqa xmm13, xmmword ptr [rsp+1E0H]
- movdqa xmm14, xmmword ptr [rsp+1F0H]
- movdqa xmm15, xmmword ptr [rsp+200H]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-ALIGN 16
-final3blocks:
- test esi, 2H
- je final1block
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+10H]
- movaps xmm8, xmm0
- movaps xmm9, xmm1
- movd xmm13, dword ptr [rsp+110H]
- movd xmm14, dword ptr [rsp+120H]
- punpckldq xmm13, xmm14
- movaps xmmword ptr [rsp], xmm13
- movd xmm14, dword ptr [rsp+114H]
- movd xmm13, dword ptr [rsp+124H]
- punpckldq xmm14, xmm13
- movaps xmmword ptr [rsp+10H], xmm14
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-innerloop2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV]
- movaps xmm10, xmm2
- movups xmm4, xmmword ptr [r8+rdx-40H]
- movups xmm5, xmmword ptr [r8+rdx-30H]
- movaps xmm3, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm3, xmm5, 221
- movaps xmm5, xmm3
- movups xmm6, xmmword ptr [r8+rdx-20H]
- movups xmm7, xmmword ptr [r8+rdx-10H]
- movaps xmm3, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 93H
- shufps xmm3, xmm7, 221
- pshufd xmm7, xmm3, 93H
- movups xmm12, xmmword ptr [r9+rdx-40H]
- movups xmm13, xmmword ptr [r9+rdx-30H]
- movaps xmm11, xmm12
- shufps xmm12, xmm13, 136
- shufps xmm11, xmm13, 221
- movaps xmm13, xmm11
- movups xmm14, xmmword ptr [r9+rdx-20H]
- movups xmm15, xmmword ptr [r9+rdx-10H]
- movaps xmm11, xmm14
- shufps xmm14, xmm15, 136
- pshufd xmm14, xmm14, 93H
- shufps xmm11, xmm15, 221
- pshufd xmm15, xmm11, 93H
- shl rax, 20H
- or rax, 40H
- movd xmm3, rax
- movdqa xmmword ptr [rsp+20H], xmm3
- movaps xmm3, xmmword ptr [rsp]
- movaps xmm11, xmmword ptr [rsp+10H]
- punpcklqdq xmm3, xmmword ptr [rsp+20H]
- punpcklqdq xmm11, xmmword ptr [rsp+20H]
- mov al, 7
-roundloop2:
- paddd xmm0, xmm4
- paddd xmm8, xmm12
- movaps xmmword ptr [rsp+20H], xmm4
- movaps xmmword ptr [rsp+30H], xmm12
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- pshuflw xmm11, xmm11, 0B1H
- pshufhw xmm11, xmm11, 0B1H
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm5
- paddd xmm8, xmm13
- movaps xmmword ptr [rsp+40H], xmm5
- movaps xmmword ptr [rsp+50H], xmm13
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movdqa xmm13, xmm3
- psrld xmm3, 8
- pslld xmm13, 24
- pxor xmm3, xmm13
- movdqa xmm13, xmm11
- psrld xmm11, 8
- pslld xmm13, 24
- pxor xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 93H
- pshufd xmm8, xmm8, 93H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm11, xmm11, 4EH
- pshufd xmm2, xmm2, 39H
- pshufd xmm10, xmm10, 39H
- paddd xmm0, xmm6
- paddd xmm8, xmm14
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- pshuflw xmm11, xmm11, 0B1H
- pshufhw xmm11, xmm11, 0B1H
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm7
- paddd xmm8, xmm15
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movdqa xmm13, xmm3
- psrld xmm3, 8
- pslld xmm13, 24
- pxor xmm3, xmm13
- movdqa xmm13, xmm11
- psrld xmm11, 8
- pslld xmm13, 24
- pxor xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 39H
- pshufd xmm8, xmm8, 39H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm11, xmm11, 4EH
- pshufd xmm2, xmm2, 93H
- pshufd xmm10, xmm10, 93H
- dec al
- je endroundloop2
- movdqa xmm12, xmmword ptr [rsp+20H]
- movdqa xmm5, xmmword ptr [rsp+40H]
- pshufd xmm13, xmm12, 0FH
- shufps xmm12, xmm5, 214
- pshufd xmm4, xmm12, 39H
- movdqa xmm12, xmm6
- shufps xmm12, xmm7, 250
- pand xmm13, xmmword ptr [PBLENDW_0x33_MASK]
- pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
- por xmm13, xmm12
- movdqa xmmword ptr [rsp+20H], xmm13
- movdqa xmm12, xmm7
- punpcklqdq xmm12, xmm5
- movdqa xmm13, xmm6
- pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm12, xmm13
- pshufd xmm12, xmm12, 78H
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH
- movdqa xmmword ptr [rsp+40H], xmm12
- movdqa xmm5, xmmword ptr [rsp+30H]
- movdqa xmm13, xmmword ptr [rsp+50H]
- pshufd xmm6, xmm5, 0FH
- shufps xmm5, xmm13, 214
- pshufd xmm12, xmm5, 39H
- movdqa xmm5, xmm14
- shufps xmm5, xmm15, 250
- pand xmm6, xmmword ptr [PBLENDW_0x33_MASK]
- pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
- por xmm6, xmm5
- movdqa xmm5, xmm15
- punpcklqdq xmm5, xmm13
- movdqa xmmword ptr [rsp+30H], xmm2
- movdqa xmm2, xmm14
- pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm5, xmm2
- movdqa xmm2, xmmword ptr [rsp+30H]
- pshufd xmm5, xmm5, 78H
- punpckhdq xmm13, xmm15
- punpckldq xmm14, xmm13
- pshufd xmm15, xmm14, 1EH
- movdqa xmm13, xmm6
- movdqa xmm14, xmm5
- movdqa xmm5, xmmword ptr [rsp+20H]
- movdqa xmm6, xmmword ptr [rsp+40H]
- jmp roundloop2
-endroundloop2:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm8, xmm10
- pxor xmm9, xmm11
- mov eax, r13d
- cmp rdx, r15
- jne innerloop2
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+10H], xmm1
- movups xmmword ptr [rbx+20H], xmm8
- movups xmmword ptr [rbx+30H], xmm9
- mov eax, dword ptr [rsp+130H]
- neg eax
- mov r10d, dword ptr [rsp+110H+8*rax]
- mov r11d, dword ptr [rsp+120H+8*rax]
- mov dword ptr [rsp+110H], r10d
- mov dword ptr [rsp+120H], r11d
- add rdi, 16
- add rbx, 64
- sub rsi, 2
-final1block:
- test esi, 1H
- je unwind
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+10H]
- movd xmm13, dword ptr [rsp+110H]
- movd xmm14, dword ptr [rsp+120H]
- punpckldq xmm13, xmm14
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-innerloop1:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV]
- shl rax, 32
- or rax, 64
- movd xmm12, rax
- movdqa xmm3, xmm13
- punpcklqdq xmm3, xmm12
- movups xmm4, xmmword ptr [r8+rdx-40H]
- movups xmm5, xmmword ptr [r8+rdx-30H]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [r8+rdx-20H]
- movups xmm7, xmmword ptr [r8+rdx-10H]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 93H
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 93H
- mov al, 7
-roundloop1:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 93H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 39H
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 39H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 93H
- dec al
- jz endroundloop1
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0FH
- pshufd xmm4, xmm8, 39H
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
- por xmm9, xmm8
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm8, xmm10
- pshufd xmm8, xmm8, 78H
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp roundloop1
-endroundloop1:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne innerloop1
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+10H], xmm1
- jmp unwind
-_blake3_hash_many_sse2 ENDP
-blake3_hash_many_sse2 ENDP
-
-; ---------------------------------------------------------------------------
-; blake3_compress_in_place_sse2 (alias _blake3_compress_in_place_sse2)
-;
-; Runs the 7-round compression loop below over one 64-byte message block and
-; writes the resulting 32 bytes back over the state pointed to by rcx.
-;
-; Register/stack inputs as read by this routine (Microsoft x64 convention):
-;   rcx          pointer to 32 bytes of state; read at entry, overwritten at exit
-;   rdx          pointer to the 64-byte message block (read only)
-;   r8b          byte placed in the low dword of the high half of state row 3
-;   r9           64-bit value placed in the low half of state row 3
-;   [rsp+28H]    (at entry) byte placed in the top dword of state row 3
-; NOTE(review): in the BLAKE3 C prototype these are presumably cv, block,
-; block_len, counter and flags -- confirm against the C header.
-;
-; Clobbers: rax, r8, xmm0-xmm5 (volatile). xmm6-xmm11 and xmm14 are used as
-; scratch and are saved/restored here because xmm6-xmm15 are nonvolatile in
-; the Microsoft x64 calling convention.
-;
-; Fix: xmm10 is written by the message-permutation code but was never saved,
-; silently corrupting the caller's xmm10. It now takes the save slot that was
-; previously spent on xmm15, which this routine never touches. The frame size
-; and every stack offset are unchanged.
-; ---------------------------------------------------------------------------
-blake3_compress_in_place_sse2 PROC
-_blake3_compress_in_place_sse2 PROC
- ; 120 = 7 * 16 bytes of save area + 8 so that rsp is 16-byte aligned after
- ; the 8-byte return address; movdqa below requires that alignment.
- sub rsp, 120
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+10H], xmm7
- movdqa xmmword ptr [rsp+20H], xmm8
- movdqa xmmword ptr [rsp+30H], xmm9
- movdqa xmmword ptr [rsp+40H], xmm11
- movdqa xmmword ptr [rsp+50H], xmm14
- ; xmm10 is clobbered in the permutation step, so it must be preserved.
- movdqa xmmword ptr [rsp+60H], xmm10
- ; State rows 0 and 1 come from the caller's 32-byte buffer, row 2 from the IV.
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+10H]
- movaps xmm2, xmmword ptr [BLAKE3_IV]
- ; 0A0H = 120 (frame) + 8 (return address) + 32 (shadow space): the fifth
- ; argument slot.
- movzx eax, byte ptr [rsp+0A0H]
- movzx r8d, r8b
- ; r8 = r8b | (stack byte << 32)
- shl rax, 32
- add r8, rax
- ; Row 3 = { r9 (low qword), r8 (high qword) }.
- movq xmm3, r9
- movq xmm4, r8
- punpcklqdq xmm3, xmm4
- ; Load the 64-byte block and de-interleave it: xmm4/xmm6 receive the
- ; even-indexed dwords, xmm5/xmm7 the odd-indexed ones; the second pair is
- ; additionally rotated by one lane (pshufd 93H).
- movups xmm4, xmmword ptr [rdx]
- movups xmm5, xmmword ptr [rdx+10H]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rdx+20H]
- movups xmm7, xmmword ptr [rdx+30H]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 93H
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 93H
- ; al counts the remaining rounds (7 in total).
- mov al, 7
-@@:
- ; ---- first half-round: add message words from xmm4 then xmm5 ----
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- ; Swapping the 16-bit halves of every dword == rotate by 16.
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- ; Rotate right by 12 (shift pair + or; SSE2 has no vector rotate).
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- ; Rotate right by 8.
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- ; Rotate right by 7.
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- ; Rotate the lanes of rows 0, 3 and 2 for the second half-round.
- pshufd xmm0, xmm0, 93H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 39H
- ; ---- second half-round: add message words from xmm6 then xmm7 ----
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- ; Undo the lane rotation applied before the second half-round.
- pshufd xmm0, xmm0, 39H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 93H
- dec al
- ; After the last round, skip the message permutation.
- jz @F
- ; ---- permute the message vectors xmm4..xmm7 for the next round ----
- ; NOTE(review): the pand/pand/por triples with the PBLENDW_* masks
- ; presumably emulate SSE4.1 pblendw; the mask values are defined elsewhere
- ; in the file -- confirm there.
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0FH
- pshufd xmm4, xmm8, 39H
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
- por xmm9, xmm8
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- ; xmm10 is used as scratch here; this is why it is saved in the prologue.
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm8, xmm10
- pshufd xmm8, xmm8, 78H
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp @B
-@@:
- ; Fold rows 2 and 3 into rows 0 and 1 and store the 32-byte result in place.
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- movups xmmword ptr [rcx], xmm0
- movups xmmword ptr [rcx+10H], xmm1
- ; Restore every nonvolatile XMM register this routine modified.
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+10H]
- movdqa xmm8, xmmword ptr [rsp+20H]
- movdqa xmm9, xmmword ptr [rsp+30H]
- movdqa xmm11, xmmword ptr [rsp+40H]
- movdqa xmm14, xmmword ptr [rsp+50H]
- movdqa xmm10, xmmword ptr [rsp+60H]
- add rsp, 120
- ret
-_blake3_compress_in_place_sse2 ENDP
-blake3_compress_in_place_sse2 ENDP
-
-ALIGN 16
-blake3_compress_xof_sse2 PROC
-_blake3_compress_xof_sse2 PROC
- sub rsp, 120
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+10H], xmm7
- movdqa xmmword ptr [rsp+20H], xmm8
- movdqa xmmword ptr [rsp+30H], xmm9
- movdqa xmmword ptr [rsp+40H], xmm11
- movdqa xmmword ptr [rsp+50H], xmm14
- movdqa xmmword ptr [rsp+60H], xmm15
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+10H]
- movaps xmm2, xmmword ptr [BLAKE3_IV]
- movzx eax, byte ptr [rsp+0A0H]
- movzx r8d, r8b
- mov r10, qword ptr [rsp+0A8H]
- shl rax, 32
- add r8, rax
- movq xmm3, r9
- movq xmm4, r8
- punpcklqdq xmm3, xmm4
- movups xmm4, xmmword ptr [rdx]
- movups xmm5, xmmword ptr [rdx+10H]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rdx+20H]
- movups xmm7, xmmword ptr [rdx+30H]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 93H
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 93H
- mov al, 7
-@@:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 93H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 39H
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshuflw xmm3, xmm3, 0B1H
- pshufhw xmm3, xmm3, 0B1H
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- movdqa xmm14, xmm3
- psrld xmm3, 8
- pslld xmm14, 24
- pxor xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 39H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 93H
- dec al
- jz @F
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0FH
- pshufd xmm4, xmm8, 39H
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
- pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
- por xmm9, xmm8
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
- pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm8, xmm10
- pshufd xmm8, xmm8, 78H
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp @B
-@@:
- movdqu xmm4, xmmword ptr [rcx]
- movdqu xmm5, xmmword ptr [rcx+10H]
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm2, xmm4
- pxor xmm3, xmm5
- movups xmmword ptr [r10], xmm0
- movups xmmword ptr [r10+10H], xmm1
- movups xmmword ptr [r10+20H], xmm2
- movups xmmword ptr [r10+30H], xmm3
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+10H]
- movdqa xmm8, xmmword ptr [rsp+20H]
- movdqa xmm9, xmmword ptr [rsp+30H]
- movdqa xmm11, xmmword ptr [rsp+40H]
- movdqa xmm14, xmmword ptr [rsp+50H]
- movdqa xmm15, xmmword ptr [rsp+60H]
- add rsp, 120
- ret
-_blake3_compress_xof_sse2 ENDP
-blake3_compress_xof_sse2 ENDP
-
-_TEXT ENDS
-
-
-_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
-ALIGN 64
-BLAKE3_IV:
- dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
-
-ADD0:
- dd 0, 1, 2, 3
-
-ADD1:
- dd 4 dup (4)
-
-BLAKE3_IV_0:
- dd 4 dup (6A09E667H)
-
-BLAKE3_IV_1:
- dd 4 dup (0BB67AE85H)
-
-BLAKE3_IV_2:
- dd 4 dup (3C6EF372H)
-
-BLAKE3_IV_3:
- dd 4 dup (0A54FF53AH)
-
-BLAKE3_BLOCK_LEN:
- dd 4 dup (64)
-
-CMP_MSB_MASK:
- dd 8 dup(80000000H)
-
-PBLENDW_0x33_MASK:
- dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
-PBLENDW_0xCC_MASK:
- dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
-PBLENDW_0x3F_MASK:
- dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
-PBLENDW_0xC0_MASK:
- dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH
-
-_RDATA ENDS
-END
diff --git a/thirdparty/BLAKE3/c/blake3_sse41.c b/thirdparty/BLAKE3/c/blake3_sse41.c
deleted file mode 100644
index b31122533..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse41.c
+++ /dev/null
@@ -1,559 +0,0 @@
-#include "blake3_impl.h"
-
-#include <immintrin.h>
-
-#define DEGREE 4
-
-#define _mm_shuffle_ps2(a, b, c) \
- (_mm_castps_si128( \
- _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
-
-INLINE __m128i loadu(const uint8_t src[16]) {
- return _mm_loadu_si128((const __m128i *)src);
-}
-
-INLINE void storeu(__m128i src, uint8_t dest[16]) {
- _mm_storeu_si128((__m128i *)dest, src);
-}
-
-INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
-
-// Note that clang-format doesn't like the name "xor" for some reason.
-INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
-
-INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
-
-INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
-}
-
-INLINE __m128i rot16(__m128i x) {
- return _mm_shuffle_epi8(
- x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
-}
-
-INLINE __m128i rot12(__m128i x) {
- return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
-}
-
-INLINE __m128i rot8(__m128i x) {
- return _mm_shuffle_epi8(
- x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
-}
-
-INLINE __m128i rot7(__m128i x) {
- return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
-}
-
-INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
- __m128i m) {
- *row0 = addv(addv(*row0, m), *row1);
- *row3 = xorv(*row3, *row0);
- *row3 = rot16(*row3);
- *row2 = addv(*row2, *row3);
- *row1 = xorv(*row1, *row2);
- *row1 = rot12(*row1);
-}
-
-INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
- __m128i m) {
- *row0 = addv(addv(*row0, m), *row1);
- *row3 = xorv(*row3, *row0);
- *row3 = rot8(*row3);
- *row2 = addv(*row2, *row3);
- *row1 = xorv(*row1, *row2);
- *row1 = rot7(*row1);
-}
-
-// Note the optimization here of leaving row1 as the unrotated row, rather than
-// row0. All the message loads below are adjusted to compensate for this. See
-// discussion at https://github.com/sneves/blake2-avx2/pull/4
-INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
-}
-
-INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
-}
-
-INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter, uint8_t flags) {
- rows[0] = loadu((uint8_t *)&cv[0]);
- rows[1] = loadu((uint8_t *)&cv[4]);
- rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
- rows[3] = set4(counter_low(counter), counter_high(counter),
- (uint32_t)block_len, (uint32_t)flags);
-
- __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
- __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
- __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
- __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
-
- __m128i t0, t1, t2, t3, tt;
-
- // Round 1. The first round permutes the message words from the original
- // input order, into the groups that get mixed in parallel.
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
- t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
- t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 2. This round and all following rounds apply a fixed permutation
- // to the message words from the round before.
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 3
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 4
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 5
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 6
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 7
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
- t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
- diagonalize(&rows[0], &rows[2], &rows[3]);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
- undiagonalize(&rows[0], &rows[2], &rows[3]);
-}
-
-void blake3_compress_in_place_sse41(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags) {
- __m128i rows[4];
- compress_pre(rows, cv, block, block_len, counter, flags);
- storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
- storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
-}
-
-void blake3_compress_xof_sse41(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]) {
- __m128i rows[4];
- compress_pre(rows, cv, block, block_len, counter, flags);
- storeu(xorv(rows[0], rows[2]), &out[0]);
- storeu(xorv(rows[1], rows[3]), &out[16]);
- storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
- storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
-}
-
-INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
- v[0] = addv(v[0], v[4]);
- v[1] = addv(v[1], v[5]);
- v[2] = addv(v[2], v[6]);
- v[3] = addv(v[3], v[7]);
- v[12] = xorv(v[12], v[0]);
- v[13] = xorv(v[13], v[1]);
- v[14] = xorv(v[14], v[2]);
- v[15] = xorv(v[15], v[3]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[15] = rot16(v[15]);
- v[8] = addv(v[8], v[12]);
- v[9] = addv(v[9], v[13]);
- v[10] = addv(v[10], v[14]);
- v[11] = addv(v[11], v[15]);
- v[4] = xorv(v[4], v[8]);
- v[5] = xorv(v[5], v[9]);
- v[6] = xorv(v[6], v[10]);
- v[7] = xorv(v[7], v[11]);
- v[4] = rot12(v[4]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
- v[0] = addv(v[0], v[4]);
- v[1] = addv(v[1], v[5]);
- v[2] = addv(v[2], v[6]);
- v[3] = addv(v[3], v[7]);
- v[12] = xorv(v[12], v[0]);
- v[13] = xorv(v[13], v[1]);
- v[14] = xorv(v[14], v[2]);
- v[15] = xorv(v[15], v[3]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[15] = rot8(v[15]);
- v[8] = addv(v[8], v[12]);
- v[9] = addv(v[9], v[13]);
- v[10] = addv(v[10], v[14]);
- v[11] = addv(v[11], v[15]);
- v[4] = xorv(v[4], v[8]);
- v[5] = xorv(v[5], v[9]);
- v[6] = xorv(v[6], v[10]);
- v[7] = xorv(v[7], v[11]);
- v[4] = rot7(v[4]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
-
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
- v[0] = addv(v[0], v[5]);
- v[1] = addv(v[1], v[6]);
- v[2] = addv(v[2], v[7]);
- v[3] = addv(v[3], v[4]);
- v[15] = xorv(v[15], v[0]);
- v[12] = xorv(v[12], v[1]);
- v[13] = xorv(v[13], v[2]);
- v[14] = xorv(v[14], v[3]);
- v[15] = rot16(v[15]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[10] = addv(v[10], v[15]);
- v[11] = addv(v[11], v[12]);
- v[8] = addv(v[8], v[13]);
- v[9] = addv(v[9], v[14]);
- v[5] = xorv(v[5], v[10]);
- v[6] = xorv(v[6], v[11]);
- v[7] = xorv(v[7], v[8]);
- v[4] = xorv(v[4], v[9]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[4] = rot12(v[4]);
- v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
- v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
- v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
- v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
- v[0] = addv(v[0], v[5]);
- v[1] = addv(v[1], v[6]);
- v[2] = addv(v[2], v[7]);
- v[3] = addv(v[3], v[4]);
- v[15] = xorv(v[15], v[0]);
- v[12] = xorv(v[12], v[1]);
- v[13] = xorv(v[13], v[2]);
- v[14] = xorv(v[14], v[3]);
- v[15] = rot8(v[15]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[10] = addv(v[10], v[15]);
- v[11] = addv(v[11], v[12]);
- v[8] = addv(v[8], v[13]);
- v[9] = addv(v[9], v[14]);
- v[5] = xorv(v[5], v[10]);
- v[6] = xorv(v[6], v[11]);
- v[7] = xorv(v[7], v[8]);
- v[4] = xorv(v[4], v[9]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
- v[4] = rot7(v[4]);
-}
-
-INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
- // 22/33. Note that this doesn't split the vector into two lanes, as the
- // AVX2 counterparts do.
- __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
- __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
- __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
- __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-
- // Interleave 64-bit lanes.
- __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
- __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
- __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
- __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
-
- vecs[0] = abcd_0;
- vecs[1] = abcd_1;
- vecs[2] = abcd_2;
- vecs[3] = abcd_3;
-}
-
-INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
- size_t block_offset, __m128i out[16]) {
- out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
- out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
- out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
- out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
- out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
- out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
- out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
- out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
- out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
- out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
- out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
- out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
- out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
- out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
- out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
- out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
- for (size_t i = 0; i < 4; ++i) {
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
- }
- transpose_vecs(&out[0]);
- transpose_vecs(&out[4]);
- transpose_vecs(&out[8]);
- transpose_vecs(&out[12]);
-}
-
-INLINE void load_counters(uint64_t counter, bool increment_counter,
- __m128i *out_lo, __m128i *out_hi) {
- const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
- const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
- const __m128i add1 = _mm_and_si128(mask, add0);
- __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
- __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
- _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
- __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
- *out_lo = l;
- *out_hi = h;
-}
-
-void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
- const uint32_t key[8], uint64_t counter,
- bool increment_counter, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
- __m128i h_vecs[8] = {
- set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
- set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
- };
- __m128i counter_low_vec, counter_high_vec;
- load_counters(counter, increment_counter, &counter_low_vec,
- &counter_high_vec);
- uint8_t block_flags = flags | flags_start;
-
- for (size_t block = 0; block < blocks; block++) {
- if (block + 1 == blocks) {
- block_flags |= flags_end;
- }
- __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
- __m128i block_flags_vec = set1(block_flags);
- __m128i msg_vecs[16];
- transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
-
- __m128i v[16] = {
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
- set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
- counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
- };
- round_fn(v, msg_vecs, 0);
- round_fn(v, msg_vecs, 1);
- round_fn(v, msg_vecs, 2);
- round_fn(v, msg_vecs, 3);
- round_fn(v, msg_vecs, 4);
- round_fn(v, msg_vecs, 5);
- round_fn(v, msg_vecs, 6);
- h_vecs[0] = xorv(v[0], v[8]);
- h_vecs[1] = xorv(v[1], v[9]);
- h_vecs[2] = xorv(v[2], v[10]);
- h_vecs[3] = xorv(v[3], v[11]);
- h_vecs[4] = xorv(v[4], v[12]);
- h_vecs[5] = xorv(v[5], v[13]);
- h_vecs[6] = xorv(v[6], v[14]);
- h_vecs[7] = xorv(v[7], v[15]);
-
- block_flags = flags;
- }
-
- transpose_vecs(&h_vecs[0]);
- transpose_vecs(&h_vecs[4]);
- // The first four vecs now contain the first half of each output, and the
- // second four vecs contain the second half of each output.
- storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
- storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
- storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
- storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
- storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
- storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
- storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
- storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
-}
-
-INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
- const uint32_t key[8], uint64_t counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
- uint32_t cv[8];
- memcpy(cv, key, BLAKE3_KEY_LEN);
- uint8_t block_flags = flags | flags_start;
- while (blocks > 0) {
- if (blocks == 1) {
- block_flags |= flags_end;
- }
- blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
- block_flags);
- input = &input[BLAKE3_BLOCK_LEN];
- blocks -= 1;
- block_flags = flags;
- }
- memcpy(out, cv, BLAKE3_OUT_LEN);
-}
-
-void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out) {
- while (num_inputs >= DEGREE) {
- blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
- flags_start, flags_end, out);
- if (increment_counter) {
- counter += DEGREE;
- }
- inputs += DEGREE;
- num_inputs -= DEGREE;
- out = &out[DEGREE * BLAKE3_OUT_LEN];
- }
- while (num_inputs > 0) {
- hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
- flags_end, out);
- if (increment_counter) {
- counter += 1;
- }
- inputs += 1;
- num_inputs -= 1;
- out = &out[BLAKE3_OUT_LEN];
- }
-}
diff --git a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S b/thirdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S
deleted file mode 100644
index a3ff64269..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S
+++ /dev/null
@@ -1,2028 +0,0 @@
-#if defined(__ELF__) && defined(__linux__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
-#if __has_include(<cet.h>)
-#include <cet.h>
-#endif
-#endif
-
-#if !defined(_CET_ENDBR)
-#define _CET_ENDBR
-#endif
-
-.intel_syntax noprefix
-.global blake3_hash_many_sse41
-.global _blake3_hash_many_sse41
-.global blake3_compress_in_place_sse41
-.global _blake3_compress_in_place_sse41
-.global blake3_compress_xof_sse41
-.global _blake3_compress_xof_sse41
-#ifdef __APPLE__
-.text
-#else
-.section .text
-#endif
- .p2align 6
-_blake3_hash_many_sse41:
-blake3_hash_many_sse41:
- _CET_ENDBR
- push r15
- push r14
- push r13
- push r12
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 360
- and rsp, 0xFFFFFFFFFFFFFFC0
- neg r9d
- movd xmm0, r9d
- pshufd xmm0, xmm0, 0x00
- movdqa xmmword ptr [rsp+0x130], xmm0
- movdqa xmm1, xmm0
- pand xmm1, xmmword ptr [ADD0+rip]
- pand xmm0, xmmword ptr [ADD1+rip]
- movdqa xmmword ptr [rsp+0x150], xmm0
- movd xmm0, r8d
- pshufd xmm0, xmm0, 0x00
- paddd xmm0, xmm1
- movdqa xmmword ptr [rsp+0x110], xmm0
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm1, xmm0
- shr r8, 32
- movd xmm2, r8d
- pshufd xmm2, xmm2, 0x00
- psubd xmm2, xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
- mov rbx, qword ptr [rbp+0x50]
- mov r15, rdx
- shl r15, 6
- movzx r13d, byte ptr [rbp+0x38]
- movzx r12d, byte ptr [rbp+0x48]
- cmp rsi, 4
- jc 3f
-2:
- movdqu xmm3, xmmword ptr [rcx]
- pshufd xmm0, xmm3, 0x00
- pshufd xmm1, xmm3, 0x55
- pshufd xmm2, xmm3, 0xAA
- pshufd xmm3, xmm3, 0xFF
- movdqu xmm7, xmmword ptr [rcx+0x10]
- pshufd xmm4, xmm7, 0x00
- pshufd xmm5, xmm7, 0x55
- pshufd xmm6, xmm7, 0xAA
- pshufd xmm7, xmm7, 0xFF
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-9:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movdqu xmm8, xmmword ptr [r8+rdx-0x40]
- movdqu xmm9, xmmword ptr [r9+rdx-0x40]
- movdqu xmm10, xmmword ptr [r10+rdx-0x40]
- movdqu xmm11, xmmword ptr [r11+rdx-0x40]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp], xmm8
- movdqa xmmword ptr [rsp+0x10], xmm9
- movdqa xmmword ptr [rsp+0x20], xmm12
- movdqa xmmword ptr [rsp+0x30], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x30]
- movdqu xmm9, xmmword ptr [r9+rdx-0x30]
- movdqu xmm10, xmmword ptr [r10+rdx-0x30]
- movdqu xmm11, xmmword ptr [r11+rdx-0x30]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x40], xmm8
- movdqa xmmword ptr [rsp+0x50], xmm9
- movdqa xmmword ptr [rsp+0x60], xmm12
- movdqa xmmword ptr [rsp+0x70], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x20]
- movdqu xmm9, xmmword ptr [r9+rdx-0x20]
- movdqu xmm10, xmmword ptr [r10+rdx-0x20]
- movdqu xmm11, xmmword ptr [r11+rdx-0x20]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x80], xmm8
- movdqa xmmword ptr [rsp+0x90], xmm9
- movdqa xmmword ptr [rsp+0xA0], xmm12
- movdqa xmmword ptr [rsp+0xB0], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x10]
- movdqu xmm9, xmmword ptr [r9+rdx-0x10]
- movdqu xmm10, xmmword ptr [r10+rdx-0x10]
- movdqu xmm11, xmmword ptr [r11+rdx-0x10]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0xC0], xmm8
- movdqa xmmword ptr [rsp+0xD0], xmm9
- movdqa xmmword ptr [rsp+0xE0], xmm12
- movdqa xmmword ptr [rsp+0xF0], xmm13
- movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
- movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
- movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
- movdqa xmm12, xmmword ptr [rsp+0x110]
- movdqa xmm13, xmmword ptr [rsp+0x120]
- movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
- movd xmm15, eax
- pshufd xmm15, xmm15, 0x00
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x80]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x70]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xB0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x50]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xC0]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xA0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0x60]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xF0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- pxor xmm0, xmm8
- pxor xmm1, xmm9
- pxor xmm2, xmm10
- pxor xmm3, xmm11
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- pxor xmm4, xmm12
- pxor xmm5, xmm13
- pxor xmm6, xmm14
- pxor xmm7, xmm15
- mov eax, r13d
- jne 9b
- movdqa xmm9, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm9, xmm1
- movdqa xmm11, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm11, xmm3
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2
- punpckhqdq xmm1, xmm2
- movdqa xmm3, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm3, xmm11
- movdqu xmmword ptr [rbx], xmm0
- movdqu xmmword ptr [rbx+0x20], xmm1
- movdqu xmmword ptr [rbx+0x40], xmm9
- movdqu xmmword ptr [rbx+0x60], xmm3
- movdqa xmm9, xmm4
- punpckldq xmm4, xmm5
- punpckhdq xmm9, xmm5
- movdqa xmm11, xmm6
- punpckldq xmm6, xmm7
- punpckhdq xmm11, xmm7
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm5, xmm6
- movdqa xmm7, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm7, xmm11
- movdqu xmmword ptr [rbx+0x10], xmm4
- movdqu xmmword ptr [rbx+0x30], xmm5
- movdqu xmmword ptr [rbx+0x50], xmm9
- movdqu xmmword ptr [rbx+0x70], xmm7
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm0, xmm1
- paddd xmm1, xmmword ptr [rsp+0x150]
- movdqa xmmword ptr [rsp+0x110], xmm1
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm0, xmm1
- movdqa xmm1, xmmword ptr [rsp+0x120]
- psubd xmm1, xmm0
- movdqa xmmword ptr [rsp+0x120], xmm1
- add rbx, 128
- add rdi, 32
- sub rsi, 4
- cmp rsi, 4
- jnc 2b
- test rsi, rsi
- jnz 3f
-4:
- mov rsp, rbp
- pop rbp
- pop rbx
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 5
-3:
- test esi, 0x2
- je 3f
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm8, xmm0
- movaps xmm9, xmm1
- movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- movaps xmmword ptr [rsp], xmm13
- movd xmm14, dword ptr [rsp+0x114]
- pinsrd xmm14, dword ptr [rsp+0x124], 1
- pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- movaps xmmword ptr [rsp+0x10], xmm14
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm10, xmm2
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm3, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm3, xmm5, 221
- movaps xmm5, xmm3
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm3, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm3, xmm7, 221
- pshufd xmm7, xmm3, 0x93
- movups xmm12, xmmword ptr [r9+rdx-0x40]
- movups xmm13, xmmword ptr [r9+rdx-0x30]
- movaps xmm11, xmm12
- shufps xmm12, xmm13, 136
- shufps xmm11, xmm13, 221
- movaps xmm13, xmm11
- movups xmm14, xmmword ptr [r9+rdx-0x20]
- movups xmm15, xmmword ptr [r9+rdx-0x10]
- movaps xmm11, xmm14
- shufps xmm14, xmm15, 136
- pshufd xmm14, xmm14, 0x93
- shufps xmm11, xmm15, 221
- pshufd xmm15, xmm11, 0x93
- movaps xmm3, xmmword ptr [rsp]
- movaps xmm11, xmmword ptr [rsp+0x10]
- pinsrd xmm3, eax, 3
- pinsrd xmm11, eax, 3
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm8, xmm12
- movaps xmmword ptr [rsp+0x20], xmm4
- movaps xmmword ptr [rsp+0x30], xmm12
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movaps xmm12, xmmword ptr [ROT16+rip]
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm5
- paddd xmm8, xmm13
- movaps xmmword ptr [rsp+0x40], xmm5
- movaps xmmword ptr [rsp+0x50], xmm13
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movaps xmm13, xmmword ptr [ROT8+rip]
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x93
- pshufd xmm8, xmm8, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x39
- pshufd xmm10, xmm10, 0x39
- paddd xmm0, xmm6
- paddd xmm8, xmm14
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm7
- paddd xmm8, xmm15
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x39
- pshufd xmm8, xmm8, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x93
- pshufd xmm10, xmm10, 0x93
- dec al
- je 9f
- movdqa xmm12, xmmword ptr [rsp+0x20]
- movdqa xmm5, xmmword ptr [rsp+0x40]
- pshufd xmm13, xmm12, 0x0F
- shufps xmm12, xmm5, 214
- pshufd xmm4, xmm12, 0x39
- movdqa xmm12, xmm6
- shufps xmm12, xmm7, 250
- pblendw xmm13, xmm12, 0xCC
- movdqa xmm12, xmm7
- punpcklqdq xmm12, xmm5
- pblendw xmm12, xmm6, 0xC0
- pshufd xmm12, xmm12, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmmword ptr [rsp+0x20], xmm13
- movdqa xmmword ptr [rsp+0x40], xmm12
- movdqa xmm5, xmmword ptr [rsp+0x30]
- movdqa xmm13, xmmword ptr [rsp+0x50]
- pshufd xmm6, xmm5, 0x0F
- shufps xmm5, xmm13, 214
- pshufd xmm12, xmm5, 0x39
- movdqa xmm5, xmm14
- shufps xmm5, xmm15, 250
- pblendw xmm6, xmm5, 0xCC
- movdqa xmm5, xmm15
- punpcklqdq xmm5, xmm13
- pblendw xmm5, xmm14, 0xC0
- pshufd xmm5, xmm5, 0x78
- punpckhdq xmm13, xmm15
- punpckldq xmm14, xmm13
- pshufd xmm15, xmm14, 0x1E
- movdqa xmm13, xmm6
- movdqa xmm14, xmm5
- movdqa xmm5, xmmword ptr [rsp+0x20]
- movdqa xmm6, xmmword ptr [rsp+0x40]
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm8, xmm10
- pxor xmm9, xmm11
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- movups xmmword ptr [rbx+0x20], xmm8
- movups xmmword ptr [rbx+0x30], xmm9
- movdqa xmm0, xmmword ptr [rsp+0x130]
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm2, xmmword ptr [rsp+0x120]
- movdqu xmm3, xmmword ptr [rsp+0x118]
- movdqu xmm4, xmmword ptr [rsp+0x128]
- blendvps xmm1, xmm3, xmm0
- blendvps xmm2, xmm4, xmm0
- movdqa xmmword ptr [rsp+0x110], xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
- add rdi, 16
- add rbx, 64
- sub rsi, 2
-3:
- test esi, 0x1
- je 4b
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x40]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm3, xmm13
- pinsrd xmm3, eax, 3
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-
-.p2align 6  /* align the entry point to a 64-byte (2^6) boundary */
-blake3_compress_in_place_sse41:  /* 7-round compression; reads 32 bytes of state at [rdi] and 64 bytes of message at [rsi], then overwrites the 32 bytes at [rdi] */
-_blake3_compress_in_place_sse41:  /* underscore-prefixed alias label for the same entry point */
- _CET_ENDBR  /* macro defined outside this view; presumably an ENDBR64 landing pad when CET is enabled - confirm */
- movups xmm0, xmmword ptr [rdi]  /* state row 0 = bytes 0..15 at rdi (unaligned load) */
- movups xmm1, xmmword ptr [rdi+0x10]  /* state row 1 = bytes 16..31 at rdi */
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* state row 2 = the 16-byte BLAKE3_IV constant */
- shl r8, 32  /* move the r8 argument into bits 32..63. NOTE(review): unlike blake3_compress_xof_sse41, rdx and r8 are not zero-extended from 8 bits first, so this relies on the caller passing clean registers - confirm */
- add rdx, r8  /* rdx = rdx + (r8 << 32) */
- movq xmm3, rcx  /* low qword of row 3 = rcx (presumably the 64-bit block counter - confirm against the C prototype) */
- movq xmm4, rdx
- punpcklqdq xmm3, xmm4  /* row 3 = { rcx, rdx }: four dwords rcx.lo, rcx.hi, rdx.lo, r8 */
- movups xmm4, xmmword ptr [rsi]  /* message dwords 0..3 */
- movups xmm5, xmmword ptr [rsi+0x10]  /* message dwords 4..7 */
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136  /* imm 0x88 selects even lanes: xmm4 = message dwords 0,2,4,6 */
- shufps xmm8, xmm5, 221  /* imm 0xDD selects odd lanes: xmm8 = message dwords 1,3,5,7 */
- movaps xmm5, xmm8  /* xmm5 = message dwords 1,3,5,7 */
- movups xmm6, xmmword ptr [rsi+0x20]  /* message dwords 8..11 */
- movups xmm7, xmmword ptr [rsi+0x30]  /* message dwords 12..15 */
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136  /* even lanes: message dwords 8,10,12,14 */
- pshufd xmm6, xmm6, 0x93  /* rotate by one lane: xmm6 = message dwords 14,8,10,12 */
- shufps xmm8, xmm7, 221  /* odd lanes: message dwords 9,11,13,15 */
- pshufd xmm7, xmm8, 0x93  /* rotate by one lane: xmm7 = message dwords 15,9,11,13 */
- movaps xmm14, xmmword ptr [ROT8+rip]  /* pshufb mask used for the 8-bit dword rotation */
- movaps xmm15, xmmword ptr [ROT16+rip]  /* pshufb mask used for the 16-bit dword rotation */
- mov al, 7  /* round counter: the loop body below runs 7 times */
-9:  /* top of the round loop (numeric local label) */
- paddd xmm0, xmm4  /* first half-round, step 1: row0 += message (xmm4) */
- paddd xmm0, xmm1  /* row0 += row1 */
- pxor xmm3, xmm0  /* row3 ^= row0 */
- pshufb xmm3, xmm15  /* rotate each dword of row3 by 16 bits */
- paddd xmm2, xmm3  /* row2 += row3 */
- pxor xmm1, xmm2  /* row1 ^= row2 */
- movdqa xmm11, xmm1  /* xmm11 is scratch for the shift-based rotation */
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11  /* row1 = each dword rotated right by 12 */
- paddd xmm0, xmm5  /* first half-round, step 2: row0 += message (xmm5) */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14  /* rotate each dword of row3 right by 8 bits */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11  /* row1 = each dword rotated right by 7 */
- pshufd xmm0, xmm0, 0x93  /* rotate the lanes of rows 0, 3 and 2 by different amounts so the next half-round pairs different lanes */
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6  /* second half-round, step 1: same mixing sequence with message register xmm6 */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15  /* rotate by 16 */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11  /* rotate right by 12 */
- paddd xmm0, xmm7  /* second half-round, step 2: message register xmm7 */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14  /* rotate right by 8 */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11  /* rotate right by 7 */
- pshufd xmm0, xmm0, 0x39  /* inverse lane rotations: restore the lane order of rows 0, 3 and 2 */
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al  /* one round done */
- jz 9f  /* after the 7th round skip the message shuffle and finish */
- movdqa xmm8, xmm4  /* from here to the jmp: rearrange the message dwords held in xmm4..xmm7 for the next round (xmm8/xmm9 are scratch) */
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39  /* new xmm4 */
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC  /* xmm9 holds the new xmm5 */
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
- pshufd xmm8, xmm8, 0x78  /* xmm8 holds the new xmm6 */
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E  /* new xmm7 */
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b  /* next round */
-9:  /* finalization */
- pxor xmm0, xmm2  /* output low half = row0 ^ row2 */
- pxor xmm1, xmm3  /* output high half = row1 ^ row3 */
- movups xmmword ptr [rdi], xmm0  /* write the 32-byte result back over the input state */
- movups xmmword ptr [rdi+0x10], xmm1
- ret
-
-.p2align 6  /* align the entry point to a 64-byte (2^6) boundary */
-blake3_compress_xof_sse41:  /* 7-round compression; reads 32 bytes of state at [rdi] and 64 bytes of message at [rsi], writes 64 bytes to [r9]; [rdi] is not modified */
-_blake3_compress_xof_sse41:  /* underscore-prefixed alias label for the same entry point */
- _CET_ENDBR  /* macro defined outside this view; presumably an ENDBR64 landing pad when CET is enabled - confirm */
- movups xmm0, xmmword ptr [rdi]  /* state row 0 = bytes 0..15 at rdi (unaligned load) */
- movups xmm1, xmmword ptr [rdi+0x10]  /* state row 1 = bytes 16..31 at rdi */
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* state row 2 = the 16-byte BLAKE3_IV constant */
- movzx eax, r8b  /* zero-extend the 8-bit argument in r8 */
- movzx edx, dl  /* zero-extend the 8-bit argument in rdx */
- shl rax, 32
- add rdx, rax  /* rdx = dl | (r8b << 32) */
- movq xmm3, rcx  /* low qword of row 3 = rcx (presumably the 64-bit block counter - confirm against the C prototype) */
- movq xmm4, rdx
- punpcklqdq xmm3, xmm4  /* row 3 = { rcx, rdx }: four dwords rcx.lo, rcx.hi, dl, r8b */
- movups xmm4, xmmword ptr [rsi]  /* message dwords 0..3 */
- movups xmm5, xmmword ptr [rsi+0x10]  /* message dwords 4..7 */
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136  /* imm 0x88 selects even lanes: xmm4 = message dwords 0,2,4,6 */
- shufps xmm8, xmm5, 221  /* imm 0xDD selects odd lanes: xmm8 = message dwords 1,3,5,7 */
- movaps xmm5, xmm8  /* xmm5 = message dwords 1,3,5,7 */
- movups xmm6, xmmword ptr [rsi+0x20]  /* message dwords 8..11 */
- movups xmm7, xmmword ptr [rsi+0x30]  /* message dwords 12..15 */
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136  /* even lanes: message dwords 8,10,12,14 */
- pshufd xmm6, xmm6, 0x93  /* rotate by one lane: xmm6 = message dwords 14,8,10,12 */
- shufps xmm8, xmm7, 221  /* odd lanes: message dwords 9,11,13,15 */
- pshufd xmm7, xmm8, 0x93  /* rotate by one lane: xmm7 = message dwords 15,9,11,13 */
- movaps xmm14, xmmword ptr [ROT8+rip]  /* pshufb mask used for the 8-bit dword rotation */
- movaps xmm15, xmmword ptr [ROT16+rip]  /* pshufb mask used for the 16-bit dword rotation */
- mov al, 7  /* round counter: the loop body below runs 7 times (overwrites the temporary left in al above) */
-9:  /* top of the round loop (numeric local label) */
- paddd xmm0, xmm4  /* first half-round, step 1: row0 += message (xmm4) */
- paddd xmm0, xmm1  /* row0 += row1 */
- pxor xmm3, xmm0  /* row3 ^= row0 */
- pshufb xmm3, xmm15  /* rotate each dword of row3 by 16 bits */
- paddd xmm2, xmm3  /* row2 += row3 */
- pxor xmm1, xmm2  /* row1 ^= row2 */
- movdqa xmm11, xmm1  /* xmm11 is scratch for the shift-based rotation */
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11  /* row1 = each dword rotated right by 12 */
- paddd xmm0, xmm5  /* first half-round, step 2: row0 += message (xmm5) */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14  /* rotate each dword of row3 right by 8 bits */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11  /* row1 = each dword rotated right by 7 */
- pshufd xmm0, xmm0, 0x93  /* rotate the lanes of rows 0, 3 and 2 by different amounts so the next half-round pairs different lanes */
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6  /* second half-round, step 1: same mixing sequence with message register xmm6 */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15  /* rotate by 16 */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11  /* rotate right by 12 */
- paddd xmm0, xmm7  /* second half-round, step 2: message register xmm7 */
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14  /* rotate right by 8 */
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11  /* rotate right by 7 */
- pshufd xmm0, xmm0, 0x39  /* inverse lane rotations: restore the lane order of rows 0, 3 and 2 */
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al  /* one round done */
- jz 9f  /* after the 7th round skip the message shuffle and finish */
- movdqa xmm8, xmm4  /* from here to the jmp: rearrange the message dwords held in xmm4..xmm7 for the next round (xmm8/xmm9 are scratch) */
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39  /* new xmm4 */
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC  /* xmm9 holds the new xmm5 */
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
- pshufd xmm8, xmm8, 0x78  /* xmm8 holds the new xmm6 */
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E  /* new xmm7 */
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b  /* next round */
-9:  /* finalization: the message registers are no longer needed, so xmm4/xmm5 are reused */
- movdqu xmm4, xmmword ptr [rdi]  /* reload the original input state, bytes 0..15 */
- movdqu xmm5, xmmword ptr [rdi+0x10]  /* original input state, bytes 16..31 */
- pxor xmm0, xmm2  /* output bytes 0..15 = row0 ^ row2 */
- pxor xmm1, xmm3  /* output bytes 16..31 = row1 ^ row3 */
- pxor xmm2, xmm4  /* output bytes 32..47 = row2 ^ original state low half */
- pxor xmm3, xmm5  /* output bytes 48..63 = row3 ^ original state high half */
- movups xmmword ptr [r9], xmm0  /* store the 64-byte result at r9 (unaligned stores) */
- movups xmmword ptr [r9+0x10], xmm1
- movups xmmword ptr [r9+0x20], xmm2
- movups xmmword ptr [r9+0x30], xmm3
- ret
-
-
-#ifdef __APPLE__
-.static_data  /* when __APPLE__ is defined: emit the constants below into the static data section */
-#else
-.section .rodata  /* otherwise: emit the constants below into .rodata */
-#endif
-.p2align 6  /* 64-byte alignment; every table below is 16 bytes, so each label stays 16-byte aligned as the movaps loads require */
-BLAKE3_IV:  /* four 32-bit constants, loaded as one 16-byte vector into state row 2 by the compress routines */
- .long 0x6A09E667, 0xBB67AE85
- .long 0x3C6EF372, 0xA54FF53A
-ROT16:  /* pshufb mask: bytes 2,3,0,1 of every dword, i.e. rotate each dword by 16 bits */
- .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-ROT8:  /* pshufb mask: bytes 1,2,3,0 of every dword, i.e. rotate each dword right by 8 bits */
- .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
-ADD0:  /* lane indices 0..3; presumably per-lane counter offsets for the 4-way hash_many path - its use is outside this block, confirm */
- .long 0, 1, 2, 3
-ADD1:  /* 4 in every lane; presumably the counter step per 4-input batch - its use is outside this block, confirm */
- .long 4, 4, 4, 4
-BLAKE3_IV_0:  /* first BLAKE3_IV word replicated into all four lanes */
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
-BLAKE3_IV_1:  /* second BLAKE3_IV word replicated into all four lanes */
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
-BLAKE3_IV_2:  /* third BLAKE3_IV word replicated into all four lanes */
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
-BLAKE3_IV_3:  /* fourth BLAKE3_IV word replicated into all four lanes */
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
-BLAKE3_BLOCK_LEN:  /* the value 64 replicated into all four lanes */
- .long 64, 64, 64, 64
-CMP_MSB_MASK:  /* sign bit of every lane; presumably XORed into both operands so the signed pcmpgtd behaves as an unsigned compare - its use is outside this block, confirm */
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
diff --git a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S b/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S
deleted file mode 100644
index 60d0a4042..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S
+++ /dev/null
@@ -1,2069 +0,0 @@
-.intel_syntax noprefix
-.global blake3_hash_many_sse41
-.global _blake3_hash_many_sse41
-.global blake3_compress_in_place_sse41
-.global _blake3_compress_in_place_sse41
-.global blake3_compress_xof_sse41
-.global _blake3_compress_xof_sse41
-.section .text
- .p2align 6
-_blake3_hash_many_sse41:
-blake3_hash_many_sse41:
- push r15
- push r14
- push r13
- push r12
- push rsi
- push rdi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 528
- and rsp, 0xFFFFFFFFFFFFFFC0
- movdqa xmmword ptr [rsp+0x170], xmm6
- movdqa xmmword ptr [rsp+0x180], xmm7
- movdqa xmmword ptr [rsp+0x190], xmm8
- movdqa xmmword ptr [rsp+0x1A0], xmm9
- movdqa xmmword ptr [rsp+0x1B0], xmm10
- movdqa xmmword ptr [rsp+0x1C0], xmm11
- movdqa xmmword ptr [rsp+0x1D0], xmm12
- movdqa xmmword ptr [rsp+0x1E0], xmm13
- movdqa xmmword ptr [rsp+0x1F0], xmm14
- movdqa xmmword ptr [rsp+0x200], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+0x68]
- movzx r9, byte ptr [rbp+0x70]
- neg r9d
- movd xmm0, r9d
- pshufd xmm0, xmm0, 0x00
- movdqa xmmword ptr [rsp+0x130], xmm0
- movdqa xmm1, xmm0
- pand xmm1, xmmword ptr [ADD0+rip]
- pand xmm0, xmmword ptr [ADD1+rip]
- movdqa xmmword ptr [rsp+0x150], xmm0
- movd xmm0, r8d
- pshufd xmm0, xmm0, 0x00
- paddd xmm0, xmm1
- movdqa xmmword ptr [rsp+0x110], xmm0
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm1, xmm0
- shr r8, 32
- movd xmm2, r8d
- pshufd xmm2, xmm2, 0x00
- psubd xmm2, xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
- mov rbx, qword ptr [rbp+0x90]
- mov r15, rdx
- shl r15, 6
- movzx r13d, byte ptr [rbp+0x78]
- movzx r12d, byte ptr [rbp+0x88]
- cmp rsi, 4
- jc 3f
-2:
- movdqu xmm3, xmmword ptr [rcx]
- pshufd xmm0, xmm3, 0x00
- pshufd xmm1, xmm3, 0x55
- pshufd xmm2, xmm3, 0xAA
- pshufd xmm3, xmm3, 0xFF
- movdqu xmm7, xmmword ptr [rcx+0x10]
- pshufd xmm4, xmm7, 0x00
- pshufd xmm5, xmm7, 0x55
- pshufd xmm6, xmm7, 0xAA
- pshufd xmm7, xmm7, 0xFF
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- mov r10, qword ptr [rdi+0x10]
- mov r11, qword ptr [rdi+0x18]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-9:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movdqu xmm8, xmmword ptr [r8+rdx-0x40]
- movdqu xmm9, xmmword ptr [r9+rdx-0x40]
- movdqu xmm10, xmmword ptr [r10+rdx-0x40]
- movdqu xmm11, xmmword ptr [r11+rdx-0x40]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp], xmm8
- movdqa xmmword ptr [rsp+0x10], xmm9
- movdqa xmmword ptr [rsp+0x20], xmm12
- movdqa xmmword ptr [rsp+0x30], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x30]
- movdqu xmm9, xmmword ptr [r9+rdx-0x30]
- movdqu xmm10, xmmword ptr [r10+rdx-0x30]
- movdqu xmm11, xmmword ptr [r11+rdx-0x30]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x40], xmm8
- movdqa xmmword ptr [rsp+0x50], xmm9
- movdqa xmmword ptr [rsp+0x60], xmm12
- movdqa xmmword ptr [rsp+0x70], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x20]
- movdqu xmm9, xmmword ptr [r9+rdx-0x20]
- movdqu xmm10, xmmword ptr [r10+rdx-0x20]
- movdqu xmm11, xmmword ptr [r11+rdx-0x20]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0x80], xmm8
- movdqa xmmword ptr [rsp+0x90], xmm9
- movdqa xmmword ptr [rsp+0xA0], xmm12
- movdqa xmmword ptr [rsp+0xB0], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-0x10]
- movdqu xmm9, xmmword ptr [r9+rdx-0x10]
- movdqu xmm10, xmmword ptr [r10+rdx-0x10]
- movdqu xmm11, xmmword ptr [r11+rdx-0x10]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0xC0], xmm8
- movdqa xmmword ptr [rsp+0xD0], xmm9
- movdqa xmmword ptr [rsp+0xE0], xmm12
- movdqa xmmword ptr [rsp+0xF0], xmm13
- movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
- movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
- movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
- movdqa xmm12, xmmword ptr [rsp+0x110]
- movdqa xmm13, xmmword ptr [rsp+0x120]
- movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
- movd xmm15, eax
- pshufd xmm15, xmm15, 0x00
- prefetcht0 [r8+rdx+0x80]
- prefetcht0 [r9+rdx+0x80]
- prefetcht0 [r10+rdx+0x80]
- prefetcht0 [r11+rdx+0x80]
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x80]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x70]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x10]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0xD0]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x60]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xB0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x50]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0xE0]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x40]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x50]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xC0]
- paddd xmm1, xmmword ptr [rsp+0x90]
- paddd xmm2, xmmword ptr [rsp+0xF0]
- paddd xmm3, xmmword ptr [rsp+0xE0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0xA0]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0x70]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x20]
- paddd xmm1, xmmword ptr [rsp+0x30]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x90]
- paddd xmm1, xmmword ptr [rsp+0xB0]
- paddd xmm2, xmmword ptr [rsp+0x80]
- paddd xmm3, xmmword ptr [rsp+0xF0]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0xC0]
- paddd xmm3, xmmword ptr [rsp+0x10]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xD0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x20]
- paddd xmm3, xmmword ptr [rsp+0x40]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0x30]
- paddd xmm1, xmmword ptr [rsp+0xA0]
- paddd xmm2, xmmword ptr [rsp+0x60]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xB0]
- paddd xmm1, xmmword ptr [rsp+0x50]
- paddd xmm2, xmmword ptr [rsp+0x10]
- paddd xmm3, xmmword ptr [rsp+0x80]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xF0]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0x90]
- paddd xmm3, xmmword ptr [rsp+0x60]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0xE0]
- paddd xmm1, xmmword ptr [rsp+0x20]
- paddd xmm2, xmmword ptr [rsp+0x30]
- paddd xmm3, xmmword ptr [rsp+0x70]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+0x100], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0xA0]
- paddd xmm1, xmmword ptr [rsp+0xC0]
- paddd xmm2, xmmword ptr [rsp+0x40]
- paddd xmm3, xmmword ptr [rsp+0xD0]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+0x100]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- pxor xmm0, xmm8
- pxor xmm1, xmm9
- pxor xmm2, xmm10
- pxor xmm3, xmm11
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- pxor xmm4, xmm12
- pxor xmm5, xmm13
- pxor xmm6, xmm14
- pxor xmm7, xmm15
- mov eax, r13d
- jne 9b
- movdqa xmm9, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm9, xmm1
- movdqa xmm11, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm11, xmm3
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2
- punpckhqdq xmm1, xmm2
- movdqa xmm3, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm3, xmm11
- movdqu xmmword ptr [rbx], xmm0
- movdqu xmmword ptr [rbx+0x20], xmm1
- movdqu xmmword ptr [rbx+0x40], xmm9
- movdqu xmmword ptr [rbx+0x60], xmm3
- movdqa xmm9, xmm4
- punpckldq xmm4, xmm5
- punpckhdq xmm9, xmm5
- movdqa xmm11, xmm6
- punpckldq xmm6, xmm7
- punpckhdq xmm11, xmm7
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm5, xmm6
- movdqa xmm7, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm7, xmm11
- movdqu xmmword ptr [rbx+0x10], xmm4
- movdqu xmmword ptr [rbx+0x30], xmm5
- movdqu xmmword ptr [rbx+0x50], xmm9
- movdqu xmmword ptr [rbx+0x70], xmm7
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm0, xmm1
- paddd xmm1, xmmword ptr [rsp+0x150]
- movdqa xmmword ptr [rsp+0x110], xmm1
- pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
- pcmpgtd xmm0, xmm1
- movdqa xmm1, xmmword ptr [rsp+0x120]
- psubd xmm1, xmm0
- movdqa xmmword ptr [rsp+0x120], xmm1
- add rbx, 128
- add rdi, 32
- sub rsi, 4
- cmp rsi, 4
- jnc 2b
- test rsi, rsi
- jne 3f
-4:
- movdqa xmm6, xmmword ptr [rsp+0x170]
- movdqa xmm7, xmmword ptr [rsp+0x180]
- movdqa xmm8, xmmword ptr [rsp+0x190]
- movdqa xmm9, xmmword ptr [rsp+0x1A0]
- movdqa xmm10, xmmword ptr [rsp+0x1B0]
- movdqa xmm11, xmmword ptr [rsp+0x1C0]
- movdqa xmm12, xmmword ptr [rsp+0x1D0]
- movdqa xmm13, xmmword ptr [rsp+0x1E0]
- movdqa xmm14, xmmword ptr [rsp+0x1F0]
- movdqa xmm15, xmmword ptr [rsp+0x200]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-.p2align 5
-3:
- test esi, 0x2
- je 3f
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm8, xmm0
- movaps xmm9, xmm1
- movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- movaps xmmword ptr [rsp], xmm13
- movd xmm14, dword ptr [rsp+0x114]
- pinsrd xmm14, dword ptr [rsp+0x124], 1
- pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- movaps xmmword ptr [rsp+0x10], xmm14
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+0x8]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm10, xmm2
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm3, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm3, xmm5, 221
- movaps xmm5, xmm3
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm3, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm3, xmm7, 221
- pshufd xmm7, xmm3, 0x93
- movups xmm12, xmmword ptr [r9+rdx-0x40]
- movups xmm13, xmmword ptr [r9+rdx-0x30]
- movaps xmm11, xmm12
- shufps xmm12, xmm13, 136
- shufps xmm11, xmm13, 221
- movaps xmm13, xmm11
- movups xmm14, xmmword ptr [r9+rdx-0x20]
- movups xmm15, xmmword ptr [r9+rdx-0x10]
- movaps xmm11, xmm14
- shufps xmm14, xmm15, 136
- pshufd xmm14, xmm14, 0x93
- shufps xmm11, xmm15, 221
- pshufd xmm15, xmm11, 0x93
- movaps xmm3, xmmword ptr [rsp]
- movaps xmm11, xmmword ptr [rsp+0x10]
- pinsrd xmm3, eax, 3
- pinsrd xmm11, eax, 3
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm8, xmm12
- movaps xmmword ptr [rsp+0x20], xmm4
- movaps xmmword ptr [rsp+0x30], xmm12
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movaps xmm12, xmmword ptr [ROT16+rip]
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm5
- paddd xmm8, xmm13
- movaps xmmword ptr [rsp+0x40], xmm5
- movaps xmmword ptr [rsp+0x50], xmm13
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movaps xmm13, xmmword ptr [ROT8+rip]
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x93
- pshufd xmm8, xmm8, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x39
- pshufd xmm10, xmm10, 0x39
- paddd xmm0, xmm6
- paddd xmm8, xmm14
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm7
- paddd xmm8, xmm15
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 0x39
- pshufd xmm8, xmm8, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm11, xmm11, 0x4E
- pshufd xmm2, xmm2, 0x93
- pshufd xmm10, xmm10, 0x93
- dec al
- je 9f
- movdqa xmm12, xmmword ptr [rsp+0x20]
- movdqa xmm5, xmmword ptr [rsp+0x40]
- pshufd xmm13, xmm12, 0x0F
- shufps xmm12, xmm5, 214
- pshufd xmm4, xmm12, 0x39
- movdqa xmm12, xmm6
- shufps xmm12, xmm7, 250
- pblendw xmm13, xmm12, 0xCC
- movdqa xmm12, xmm7
- punpcklqdq xmm12, xmm5
- pblendw xmm12, xmm6, 0xC0
- pshufd xmm12, xmm12, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmmword ptr [rsp+0x20], xmm13
- movdqa xmmword ptr [rsp+0x40], xmm12
- movdqa xmm5, xmmword ptr [rsp+0x30]
- movdqa xmm13, xmmword ptr [rsp+0x50]
- pshufd xmm6, xmm5, 0x0F
- shufps xmm5, xmm13, 214
- pshufd xmm12, xmm5, 0x39
- movdqa xmm5, xmm14
- shufps xmm5, xmm15, 250
- pblendw xmm6, xmm5, 0xCC
- movdqa xmm5, xmm15
- punpcklqdq xmm5, xmm13
- pblendw xmm5, xmm14, 0xC0
- pshufd xmm5, xmm5, 0x78
- punpckhdq xmm13, xmm15
- punpckldq xmm14, xmm13
- pshufd xmm15, xmm14, 0x1E
- movdqa xmm13, xmm6
- movdqa xmm14, xmm5
- movdqa xmm5, xmmword ptr [rsp+0x20]
- movdqa xmm6, xmmword ptr [rsp+0x40]
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm8, xmm10
- pxor xmm9, xmm11
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- movups xmmword ptr [rbx+0x20], xmm8
- movups xmmword ptr [rbx+0x30], xmm9
- movdqa xmm0, xmmword ptr [rsp+0x130]
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm2, xmmword ptr [rsp+0x120]
- movdqu xmm3, xmmword ptr [rsp+0x118]
- movdqu xmm4, xmmword ptr [rsp+0x128]
- blendvps xmm1, xmm3, xmm0
- blendvps xmm2, xmm4, xmm0
- movdqa xmmword ptr [rsp+0x110], xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
- add rdi, 16
- add rbx, 64
- sub rsi, 2
-3:
- test esi, 0x1
- je 4b
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
- movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+0x80]
- or eax, r13d
- xor edx, edx
-2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm3, xmm13
- pinsrd xmm3, eax, 3
- movups xmm4, xmmword ptr [r8+rdx-0x40]
- movups xmm5, xmmword ptr [r8+rdx-0x30]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [r8+rdx-0x20]
- movups xmm7, xmmword ptr [r8+rdx-0x10]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne 2b
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+0x10], xmm1
- jmp 4b
-
-.p2align 6
-blake3_compress_in_place_sse41:
-_blake3_compress_in_place_sse41:
- sub rsp, 120
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+0x10], xmm7
- movdqa xmmword ptr [rsp+0x20], xmm8
- movdqa xmmword ptr [rsp+0x30], xmm9
- movdqa xmmword ptr [rsp+0x40], xmm11
- movdqa xmmword ptr [rsp+0x50], xmm14
- movdqa xmmword ptr [rsp+0x60], xmm15
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movzx eax, byte ptr [rsp+0xA0]
- movzx r8d, r8b
- shl rax, 32
- add r8, rax
- movq xmm3, r9
- movq xmm4, r8
- punpcklqdq xmm3, xmm4
- movups xmm4, xmmword ptr [rdx]
- movups xmm5, xmmword ptr [rdx+0x10]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rdx+0x20]
- movups xmm7, xmmword ptr [rdx+0x30]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- movups xmmword ptr [rcx], xmm0
- movups xmmword ptr [rcx+0x10], xmm1
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+0x10]
- movdqa xmm8, xmmword ptr [rsp+0x20]
- movdqa xmm9, xmmword ptr [rsp+0x30]
- movdqa xmm11, xmmword ptr [rsp+0x40]
- movdqa xmm14, xmmword ptr [rsp+0x50]
- movdqa xmm15, xmmword ptr [rsp+0x60]
- add rsp, 120
- ret
-
-
-.p2align 6
-_blake3_compress_xof_sse41:
-blake3_compress_xof_sse41:
- sub rsp, 120
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+0x10], xmm7
- movdqa xmmword ptr [rsp+0x20], xmm8
- movdqa xmmword ptr [rsp+0x30], xmm9
- movdqa xmmword ptr [rsp+0x40], xmm11
- movdqa xmmword ptr [rsp+0x50], xmm14
- movdqa xmmword ptr [rsp+0x60], xmm15
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+0x10]
- movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movzx eax, byte ptr [rsp+0xA0]
- movzx r8d, r8b
- mov r10, qword ptr [rsp+0xA8]
- shl rax, 32
- add r8, rax
- movq xmm3, r9
- movq xmm4, r8
- punpcklqdq xmm3, xmm4
- movups xmm4, xmmword ptr [rdx]
- movups xmm5, xmmword ptr [rdx+0x10]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rdx+0x20]
- movups xmm7, xmmword ptr [rdx+0x30]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 0x93
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 0x93
- movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
- mov al, 7
-9:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x93
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x39
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 0x39
- pshufd xmm3, xmm3, 0x4E
- pshufd xmm2, xmm2, 0x93
- dec al
- jz 9f
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0x0F
- pshufd xmm4, xmm8, 0x39
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
- pshufd xmm8, xmm8, 0x78
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 0x1E
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp 9b
-9:
- movdqu xmm4, xmmword ptr [rcx]
- movdqu xmm5, xmmword ptr [rcx+0x10]
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm2, xmm4
- pxor xmm3, xmm5
- movups xmmword ptr [r10], xmm0
- movups xmmword ptr [r10+0x10], xmm1
- movups xmmword ptr [r10+0x20], xmm2
- movups xmmword ptr [r10+0x30], xmm3
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+0x10]
- movdqa xmm8, xmmword ptr [rsp+0x20]
- movdqa xmm9, xmmword ptr [rsp+0x30]
- movdqa xmm11, xmmword ptr [rsp+0x40]
- movdqa xmm14, xmmword ptr [rsp+0x50]
- movdqa xmm15, xmmword ptr [rsp+0x60]
- add rsp, 120
- ret
-
-
-.section .rodata
-.p2align 6
-BLAKE3_IV:
- .long 0x6A09E667, 0xBB67AE85
- .long 0x3C6EF372, 0xA54FF53A
-ROT16:
- .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-ROT8:
- .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
-ADD0:
- .long 0, 1, 2, 3
-ADD1:
- .long 4, 4, 4, 4
-BLAKE3_IV_0:
- .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
-BLAKE3_IV_1:
- .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
-BLAKE3_IV_2:
- .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
-BLAKE3_IV_3:
- .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
-BLAKE3_BLOCK_LEN:
- .long 64, 64, 64, 64
-CMP_MSB_MASK:
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
diff --git a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm b/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm
deleted file mode 100644
index 87001e4d3..000000000
--- a/thirdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm
+++ /dev/null
@@ -1,2089 +0,0 @@
-public _blake3_hash_many_sse41
-public blake3_hash_many_sse41
-public blake3_compress_in_place_sse41
-public _blake3_compress_in_place_sse41
-public blake3_compress_xof_sse41
-public _blake3_compress_xof_sse41
-
-_TEXT SEGMENT ALIGN(16) 'CODE'
-
-ALIGN 16
-blake3_hash_many_sse41 PROC
-_blake3_hash_many_sse41 PROC
- push r15
- push r14
- push r13
- push r12
- push rsi
- push rdi
- push rbx
- push rbp
- mov rbp, rsp
- sub rsp, 528
- and rsp, 0FFFFFFFFFFFFFFC0H
- movdqa xmmword ptr [rsp+170H], xmm6
- movdqa xmmword ptr [rsp+180H], xmm7
- movdqa xmmword ptr [rsp+190H], xmm8
- movdqa xmmword ptr [rsp+1A0H], xmm9
- movdqa xmmword ptr [rsp+1B0H], xmm10
- movdqa xmmword ptr [rsp+1C0H], xmm11
- movdqa xmmword ptr [rsp+1D0H], xmm12
- movdqa xmmword ptr [rsp+1E0H], xmm13
- movdqa xmmword ptr [rsp+1F0H], xmm14
- movdqa xmmword ptr [rsp+200H], xmm15
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, qword ptr [rbp+68H]
- movzx r9, byte ptr [rbp+70H]
- neg r9d
- movd xmm0, r9d
- pshufd xmm0, xmm0, 00H
- movdqa xmmword ptr [rsp+130H], xmm0
- movdqa xmm1, xmm0
- pand xmm1, xmmword ptr [ADD0]
- pand xmm0, xmmword ptr [ADD1]
- movdqa xmmword ptr [rsp+150H], xmm0
- movd xmm0, r8d
- pshufd xmm0, xmm0, 00H
- paddd xmm0, xmm1
- movdqa xmmword ptr [rsp+110H], xmm0
- pxor xmm0, xmmword ptr [CMP_MSB_MASK]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK]
- pcmpgtd xmm1, xmm0
- shr r8, 32
- movd xmm2, r8d
- pshufd xmm2, xmm2, 00H
- psubd xmm2, xmm1
- movdqa xmmword ptr [rsp+120H], xmm2
- mov rbx, qword ptr [rbp+90H]
- mov r15, rdx
- shl r15, 6
- movzx r13d, byte ptr [rbp+78H]
- movzx r12d, byte ptr [rbp+88H]
- cmp rsi, 4
- jc final3blocks
-outerloop4:
- movdqu xmm3, xmmword ptr [rcx]
- pshufd xmm0, xmm3, 00H
- pshufd xmm1, xmm3, 55H
- pshufd xmm2, xmm3, 0AAH
- pshufd xmm3, xmm3, 0FFH
- movdqu xmm7, xmmword ptr [rcx+10H]
- pshufd xmm4, xmm7, 00H
- pshufd xmm5, xmm7, 55H
- pshufd xmm6, xmm7, 0AAH
- pshufd xmm7, xmm7, 0FFH
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- mov r10, qword ptr [rdi+10H]
- mov r11, qword ptr [rdi+18H]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-innerloop4:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movdqu xmm8, xmmword ptr [r8+rdx-40H]
- movdqu xmm9, xmmword ptr [r9+rdx-40H]
- movdqu xmm10, xmmword ptr [r10+rdx-40H]
- movdqu xmm11, xmmword ptr [r11+rdx-40H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp], xmm8
- movdqa xmmword ptr [rsp+10H], xmm9
- movdqa xmmword ptr [rsp+20H], xmm12
- movdqa xmmword ptr [rsp+30H], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-30H]
- movdqu xmm9, xmmword ptr [r9+rdx-30H]
- movdqu xmm10, xmmword ptr [r10+rdx-30H]
- movdqu xmm11, xmmword ptr [r11+rdx-30H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+40H], xmm8
- movdqa xmmword ptr [rsp+50H], xmm9
- movdqa xmmword ptr [rsp+60H], xmm12
- movdqa xmmword ptr [rsp+70H], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-20H]
- movdqu xmm9, xmmword ptr [r9+rdx-20H]
- movdqu xmm10, xmmword ptr [r10+rdx-20H]
- movdqu xmm11, xmmword ptr [r11+rdx-20H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+80H], xmm8
- movdqa xmmword ptr [rsp+90H], xmm9
- movdqa xmmword ptr [rsp+0A0H], xmm12
- movdqa xmmword ptr [rsp+0B0H], xmm13
- movdqu xmm8, xmmword ptr [r8+rdx-10H]
- movdqu xmm9, xmmword ptr [r9+rdx-10H]
- movdqu xmm10, xmmword ptr [r10+rdx-10H]
- movdqu xmm11, xmmword ptr [r11+rdx-10H]
- movdqa xmm12, xmm8
- punpckldq xmm8, xmm9
- punpckhdq xmm12, xmm9
- movdqa xmm14, xmm10
- punpckldq xmm10, xmm11
- punpckhdq xmm14, xmm11
- movdqa xmm9, xmm8
- punpcklqdq xmm8, xmm10
- punpckhqdq xmm9, xmm10
- movdqa xmm13, xmm12
- punpcklqdq xmm12, xmm14
- punpckhqdq xmm13, xmm14
- movdqa xmmword ptr [rsp+0C0H], xmm8
- movdqa xmmword ptr [rsp+0D0H], xmm9
- movdqa xmmword ptr [rsp+0E0H], xmm12
- movdqa xmmword ptr [rsp+0F0H], xmm13
- movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
- movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
- movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
- movdqa xmm12, xmmword ptr [rsp+110H]
- movdqa xmm13, xmmword ptr [rsp+120H]
- movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
- movd xmm15, eax
- pshufd xmm15, xmm15, 00H
- prefetcht0 byte ptr [r8+rdx+80H]
- prefetcht0 byte ptr [r9+rdx+80H]
- prefetcht0 byte ptr [r10+rdx+80H]
- prefetcht0 byte ptr [r11+rdx+80H]
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+20H]
- paddd xmm2, xmmword ptr [rsp+40H]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+10H]
- paddd xmm1, xmmword ptr [rsp+30H]
- paddd xmm2, xmmword ptr [rsp+50H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+80H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp+0C0H]
- paddd xmm3, xmmword ptr [rsp+0E0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+90H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+0D0H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+20H]
- paddd xmm1, xmmword ptr [rsp+30H]
- paddd xmm2, xmmword ptr [rsp+70H]
- paddd xmm3, xmmword ptr [rsp+40H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+60H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+0D0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+10H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+90H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0B0H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp+0E0H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+30H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp+0D0H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+40H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+20H]
- paddd xmm3, xmmword ptr [rsp+0E0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+60H]
- paddd xmm1, xmmword ptr [rsp+90H]
- paddd xmm2, xmmword ptr [rsp+0B0H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+50H]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+0F0H]
- paddd xmm3, xmmword ptr [rsp+10H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0A0H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+0E0H]
- paddd xmm3, xmmword ptr [rsp+0D0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+70H]
- paddd xmm1, xmmword ptr [rsp+90H]
- paddd xmm2, xmmword ptr [rsp+30H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+40H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+50H]
- paddd xmm3, xmmword ptr [rsp+10H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp]
- paddd xmm1, xmmword ptr [rsp+20H]
- paddd xmm2, xmmword ptr [rsp+80H]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0C0H]
- paddd xmm1, xmmword ptr [rsp+90H]
- paddd xmm2, xmmword ptr [rsp+0F0H]
- paddd xmm3, xmmword ptr [rsp+0E0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0D0H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+0A0H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+70H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+20H]
- paddd xmm1, xmmword ptr [rsp+30H]
- paddd xmm2, xmmword ptr [rsp+10H]
- paddd xmm3, xmmword ptr [rsp+40H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+90H]
- paddd xmm1, xmmword ptr [rsp+0B0H]
- paddd xmm2, xmmword ptr [rsp+80H]
- paddd xmm3, xmmword ptr [rsp+0F0H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0E0H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp+0C0H]
- paddd xmm3, xmmword ptr [rsp+10H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0D0H]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+20H]
- paddd xmm3, xmmword ptr [rsp+40H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+30H]
- paddd xmm1, xmmword ptr [rsp+0A0H]
- paddd xmm2, xmmword ptr [rsp+60H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0B0H]
- paddd xmm1, xmmword ptr [rsp+50H]
- paddd xmm2, xmmword ptr [rsp+10H]
- paddd xmm3, xmmword ptr [rsp+80H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0F0H]
- paddd xmm1, xmmword ptr [rsp]
- paddd xmm2, xmmword ptr [rsp+90H]
- paddd xmm3, xmmword ptr [rsp+60H]
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- paddd xmm2, xmm6
- paddd xmm3, xmm7
- pxor xmm12, xmm0
- pxor xmm13, xmm1
- pxor xmm14, xmm2
- pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm12
- paddd xmm9, xmm13
- paddd xmm10, xmm14
- paddd xmm11, xmm15
- pxor xmm4, xmm8
- pxor xmm5, xmm9
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- paddd xmm0, xmmword ptr [rsp+0E0H]
- paddd xmm1, xmmword ptr [rsp+20H]
- paddd xmm2, xmmword ptr [rsp+30H]
- paddd xmm3, xmmword ptr [rsp+70H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- movdqa xmmword ptr [rsp+100H], xmm8
- movdqa xmm8, xmm5
- psrld xmm8, 12
- pslld xmm5, 20
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 12
- pslld xmm6, 20
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 12
- pslld xmm7, 20
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 12
- pslld xmm4, 20
- por xmm4, xmm8
- paddd xmm0, xmmword ptr [rsp+0A0H]
- paddd xmm1, xmmword ptr [rsp+0C0H]
- paddd xmm2, xmmword ptr [rsp+40H]
- paddd xmm3, xmmword ptr [rsp+0D0H]
- paddd xmm0, xmm5
- paddd xmm1, xmm6
- paddd xmm2, xmm7
- paddd xmm3, xmm4
- pxor xmm15, xmm0
- pxor xmm12, xmm1
- pxor xmm13, xmm2
- pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- paddd xmm10, xmm15
- paddd xmm11, xmm12
- movdqa xmm8, xmmword ptr [rsp+100H]
- paddd xmm8, xmm13
- paddd xmm9, xmm14
- pxor xmm5, xmm10
- pxor xmm6, xmm11
- pxor xmm7, xmm8
- pxor xmm4, xmm9
- pxor xmm0, xmm8
- pxor xmm1, xmm9
- pxor xmm2, xmm10
- pxor xmm3, xmm11
- movdqa xmm8, xmm5
- psrld xmm8, 7
- pslld xmm5, 25
- por xmm5, xmm8
- movdqa xmm8, xmm6
- psrld xmm8, 7
- pslld xmm6, 25
- por xmm6, xmm8
- movdqa xmm8, xmm7
- psrld xmm8, 7
- pslld xmm7, 25
- por xmm7, xmm8
- movdqa xmm8, xmm4
- psrld xmm8, 7
- pslld xmm4, 25
- por xmm4, xmm8
- pxor xmm4, xmm12
- pxor xmm5, xmm13
- pxor xmm6, xmm14
- pxor xmm7, xmm15
- mov eax, r13d
- jne innerloop4
- movdqa xmm9, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm9, xmm1
- movdqa xmm11, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm11, xmm3
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2
- punpckhqdq xmm1, xmm2
- movdqa xmm3, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm3, xmm11
- movdqu xmmword ptr [rbx], xmm0
- movdqu xmmword ptr [rbx+20H], xmm1
- movdqu xmmword ptr [rbx+40H], xmm9
- movdqu xmmword ptr [rbx+60H], xmm3
- movdqa xmm9, xmm4
- punpckldq xmm4, xmm5
- punpckhdq xmm9, xmm5
- movdqa xmm11, xmm6
- punpckldq xmm6, xmm7
- punpckhdq xmm11, xmm7
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm5, xmm6
- movdqa xmm7, xmm9
- punpcklqdq xmm9, xmm11
- punpckhqdq xmm7, xmm11
- movdqu xmmword ptr [rbx+10H], xmm4
- movdqu xmmword ptr [rbx+30H], xmm5
- movdqu xmmword ptr [rbx+50H], xmm9
- movdqu xmmword ptr [rbx+70H], xmm7
- movdqa xmm1, xmmword ptr [rsp+110H]
- movdqa xmm0, xmm1
- paddd xmm1, xmmword ptr [rsp+150H]
- movdqa xmmword ptr [rsp+110H], xmm1
- pxor xmm0, xmmword ptr [CMP_MSB_MASK]
- pxor xmm1, xmmword ptr [CMP_MSB_MASK]
- pcmpgtd xmm0, xmm1
- movdqa xmm1, xmmword ptr [rsp+120H]
- psubd xmm1, xmm0
- movdqa xmmword ptr [rsp+120H], xmm1
- add rbx, 128
- add rdi, 32
- sub rsi, 4
- cmp rsi, 4
- jnc outerloop4
- test rsi, rsi
- jne final3blocks
-unwind:
- movdqa xmm6, xmmword ptr [rsp+170H]
- movdqa xmm7, xmmword ptr [rsp+180H]
- movdqa xmm8, xmmword ptr [rsp+190H]
- movdqa xmm9, xmmword ptr [rsp+1A0H]
- movdqa xmm10, xmmword ptr [rsp+1B0H]
- movdqa xmm11, xmmword ptr [rsp+1C0H]
- movdqa xmm12, xmmword ptr [rsp+1D0H]
- movdqa xmm13, xmmword ptr [rsp+1E0H]
- movdqa xmm14, xmmword ptr [rsp+1F0H]
- movdqa xmm15, xmmword ptr [rsp+200H]
- mov rsp, rbp
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- pop r12
- pop r13
- pop r14
- pop r15
- ret
-ALIGN 16
-final3blocks:
- test esi, 2H
- je final1block
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+10H]
- movaps xmm8, xmm0
- movaps xmm9, xmm1
- movd xmm13, dword ptr [rsp+110H]
- pinsrd xmm13, dword ptr [rsp+120H], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
- movaps xmmword ptr [rsp], xmm13
- movd xmm14, dword ptr [rsp+114H]
- pinsrd xmm14, dword ptr [rsp+124H], 1
- pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
- movaps xmmword ptr [rsp+10H], xmm14
- mov r8, qword ptr [rdi]
- mov r9, qword ptr [rdi+8H]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-innerloop2:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV]
- movaps xmm10, xmm2
- movups xmm4, xmmword ptr [r8+rdx-40H]
- movups xmm5, xmmword ptr [r8+rdx-30H]
- movaps xmm3, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm3, xmm5, 221
- movaps xmm5, xmm3
- movups xmm6, xmmword ptr [r8+rdx-20H]
- movups xmm7, xmmword ptr [r8+rdx-10H]
- movaps xmm3, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 93H
- shufps xmm3, xmm7, 221
- pshufd xmm7, xmm3, 93H
- movups xmm12, xmmword ptr [r9+rdx-40H]
- movups xmm13, xmmword ptr [r9+rdx-30H]
- movaps xmm11, xmm12
- shufps xmm12, xmm13, 136
- shufps xmm11, xmm13, 221
- movaps xmm13, xmm11
- movups xmm14, xmmword ptr [r9+rdx-20H]
- movups xmm15, xmmword ptr [r9+rdx-10H]
- movaps xmm11, xmm14
- shufps xmm14, xmm15, 136
- pshufd xmm14, xmm14, 93H
- shufps xmm11, xmm15, 221
- pshufd xmm15, xmm11, 93H
- movaps xmm3, xmmword ptr [rsp]
- movaps xmm11, xmmword ptr [rsp+10H]
- pinsrd xmm3, eax, 3
- pinsrd xmm11, eax, 3
- mov al, 7
-roundloop2:
- paddd xmm0, xmm4
- paddd xmm8, xmm12
- movaps xmmword ptr [rsp+20H], xmm4
- movaps xmmword ptr [rsp+30H], xmm12
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movaps xmm12, xmmword ptr [ROT16]
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm5
- paddd xmm8, xmm13
- movaps xmmword ptr [rsp+40H], xmm5
- movaps xmmword ptr [rsp+50H], xmm13
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- movaps xmm13, xmmword ptr [ROT8]
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 93H
- pshufd xmm8, xmm8, 93H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm11, xmm11, 4EH
- pshufd xmm2, xmm2, 39H
- pshufd xmm10, xmm10, 39H
- paddd xmm0, xmm6
- paddd xmm8, xmm14
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 20
- psrld xmm4, 12
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 20
- psrld xmm4, 12
- por xmm9, xmm4
- paddd xmm0, xmm7
- paddd xmm8, xmm15
- paddd xmm0, xmm1
- paddd xmm8, xmm9
- pxor xmm3, xmm0
- pxor xmm11, xmm8
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
- paddd xmm2, xmm3
- paddd xmm10, xmm11
- pxor xmm1, xmm2
- pxor xmm9, xmm10
- movdqa xmm4, xmm1
- pslld xmm1, 25
- psrld xmm4, 7
- por xmm1, xmm4
- movdqa xmm4, xmm9
- pslld xmm9, 25
- psrld xmm4, 7
- por xmm9, xmm4
- pshufd xmm0, xmm0, 39H
- pshufd xmm8, xmm8, 39H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm11, xmm11, 4EH
- pshufd xmm2, xmm2, 93H
- pshufd xmm10, xmm10, 93H
- dec al
- je endroundloop2
- movdqa xmm12, xmmword ptr [rsp+20H]
- movdqa xmm5, xmmword ptr [rsp+40H]
- pshufd xmm13, xmm12, 0FH
- shufps xmm12, xmm5, 214
- pshufd xmm4, xmm12, 39H
- movdqa xmm12, xmm6
- shufps xmm12, xmm7, 250
- pblendw xmm13, xmm12, 0CCH
- movdqa xmm12, xmm7
- punpcklqdq xmm12, xmm5
- pblendw xmm12, xmm6, 0C0H
- pshufd xmm12, xmm12, 78H
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH
- movdqa xmmword ptr [rsp+20H], xmm13
- movdqa xmmword ptr [rsp+40H], xmm12
- movdqa xmm5, xmmword ptr [rsp+30H]
- movdqa xmm13, xmmword ptr [rsp+50H]
- pshufd xmm6, xmm5, 0FH
- shufps xmm5, xmm13, 214
- pshufd xmm12, xmm5, 39H
- movdqa xmm5, xmm14
- shufps xmm5, xmm15, 250
- pblendw xmm6, xmm5, 0CCH
- movdqa xmm5, xmm15
- punpcklqdq xmm5, xmm13
- pblendw xmm5, xmm14, 0C0H
- pshufd xmm5, xmm5, 78H
- punpckhdq xmm13, xmm15
- punpckldq xmm14, xmm13
- pshufd xmm15, xmm14, 1EH
- movdqa xmm13, xmm6
- movdqa xmm14, xmm5
- movdqa xmm5, xmmword ptr [rsp+20H]
- movdqa xmm6, xmmword ptr [rsp+40H]
- jmp roundloop2
-endroundloop2:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- pxor xmm8, xmm10
- pxor xmm9, xmm11
- mov eax, r13d
- cmp rdx, r15
- jne innerloop2
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+10H], xmm1
- movups xmmword ptr [rbx+20H], xmm8
- movups xmmword ptr [rbx+30H], xmm9
- movdqa xmm0, xmmword ptr [rsp+130H]
- movdqa xmm1, xmmword ptr [rsp+110H]
- movdqa xmm2, xmmword ptr [rsp+120H]
- movdqu xmm3, xmmword ptr [rsp+118H]
- movdqu xmm4, xmmword ptr [rsp+128H]
- blendvps xmm1, xmm3, xmm0
- blendvps xmm2, xmm4, xmm0
- movdqa xmmword ptr [rsp+110H], xmm1
- movdqa xmmword ptr [rsp+120H], xmm2
- add rdi, 16
- add rbx, 64
- sub rsi, 2
-final1block:
- test esi, 1H
- je unwind
- movups xmm0, xmmword ptr [rcx]
- movups xmm1, xmmword ptr [rcx+10H]
- movd xmm13, dword ptr [rsp+110H]
- pinsrd xmm13, dword ptr [rsp+120H], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
- movaps xmm14, xmmword ptr [ROT8]
- movaps xmm15, xmmword ptr [ROT16]
- mov r8, qword ptr [rdi]
- movzx eax, byte ptr [rbp+80H]
- or eax, r13d
- xor edx, edx
-innerloop1:
- mov r14d, eax
- or eax, r12d
- add rdx, 64
- cmp rdx, r15
- cmovne eax, r14d
- movaps xmm2, xmmword ptr [BLAKE3_IV]
- movaps xmm3, xmm13
- pinsrd xmm3, eax, 3
- movups xmm4, xmmword ptr [r8+rdx-40H]
- movups xmm5, xmmword ptr [r8+rdx-30H]
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136
- shufps xmm8, xmm5, 221
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [r8+rdx-20H]
- movups xmm7, xmmword ptr [r8+rdx-10H]
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136
- pshufd xmm6, xmm6, 93H
- shufps xmm8, xmm7, 221
- pshufd xmm7, xmm8, 93H
- mov al, 7
-roundloop1:
- paddd xmm0, xmm4
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm5
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 93H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 39H
- paddd xmm0, xmm6
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 39H
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 93H
- dec al
- jz endroundloop1
- movdqa xmm8, xmm4
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0FH
- pshufd xmm4, xmm8, 39H
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0CCH
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0C0H
- pshufd xmm8, xmm8, 78H
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp roundloop1
-endroundloop1:
- pxor xmm0, xmm2
- pxor xmm1, xmm3
- mov eax, r13d
- cmp rdx, r15
- jne innerloop1
- movups xmmword ptr [rbx], xmm0
- movups xmmword ptr [rbx+10H], xmm1
- jmp unwind
-_blake3_hash_many_sse41 ENDP
-blake3_hash_many_sse41 ENDP
-
-blake3_compress_in_place_sse41 PROC ; Runs a 7-round compression over the 32 bytes at [rcx] using the 64 bytes at [rdx], and writes the 32-byte result back to [rcx].
-_blake3_compress_in_place_sse41 PROC ; Underscore-prefixed alias for the same entry point. Inputs read below: rcx, rdx, r8b, r9 and one byte from the caller's stack.
- sub rsp, 120 ; 112 bytes hold the seven XMM saves below; the extra 8 presumably re-align rsp to 16 for movdqa (assumes Win64 entry alignment - confirm).
- movdqa xmmword ptr [rsp], xmm6 ; Save every XMM register in the xmm6..xmm15 range that this routine overwrites (non-volatile in the Microsoft x64 ABI).
- movdqa xmmword ptr [rsp+10H], xmm7
- movdqa xmmword ptr [rsp+20H], xmm8
- movdqa xmmword ptr [rsp+30H], xmm9
- movdqa xmmword ptr [rsp+40H], xmm11
- movdqa xmmword ptr [rsp+50H], xmm14
- movdqa xmmword ptr [rsp+60H], xmm15
- movups xmm0, xmmword ptr [rcx] ; State row 0 = first 16 bytes at rcx (presumably the chaining value; unaligned load).
- movups xmm1, xmmword ptr [rcx+10H] ; State row 1 = next 16 bytes at rcx.
- movaps xmm2, xmmword ptr [BLAKE3_IV] ; State row 2 = the BLAKE3_IV constant (defined elsewhere in the file).
- movzx eax, byte ptr [rsp+0A0H] ; 0A0H = 78H (this frame) + 28H, i.e. entry-rsp+28H: the fifth-argument stack slot in Win64 (presumably the flags byte - confirm against the C prototype).
- movzx r8d, r8b ; Keep only the low byte of r8 (presumably block_len).
- shl rax, 32 ; Move the stack byte into bits 32..39 ...
- add r8, rax ; ... so r8 = (stack byte << 32) | r8b.
- movq xmm3, r9 ; Low 64 bits of row 3 = r9 (presumably the 64-bit counter).
- movq xmm4, r8
- punpcklqdq xmm3, xmm4 ; Row 3 = { r9 low dword, r9 high dword, r8b, stack byte }.
- movups xmm4, xmmword ptr [rdx] ; Load message dwords 0..3 from rdx.
- movups xmm5, xmmword ptr [rdx+10H] ; Load message dwords 4..7.
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136 ; 88H: xmm4 = even-indexed dwords {0,2,4,6}.
- shufps xmm8, xmm5, 221 ; 0DDH: xmm8 = odd-indexed dwords {1,3,5,7}.
- movaps xmm5, xmm8 ; xmm5 = {1,3,5,7}.
- movups xmm6, xmmword ptr [rdx+20H] ; Load message dwords 8..11.
- movups xmm7, xmmword ptr [rdx+30H] ; Load message dwords 12..15.
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136 ; Even-indexed dwords {8,10,12,14} ...
- pshufd xmm6, xmm6, 93H ; ... rotated by one lane: xmm6 = {14,8,10,12}.
- shufps xmm8, xmm7, 221 ; Odd-indexed dwords {9,11,13,15} ...
- pshufd xmm7, xmm8, 93H ; ... rotated by one lane: xmm7 = {15,9,11,13}.
- movaps xmm14, xmmword ptr [ROT8] ; pshufb mask ROT8 (defined elsewhere; presumably rotates each dword by 8 bits).
- movaps xmm15, xmmword ptr [ROT16] ; pshufb mask ROT16 (defined elsewhere; presumably rotates each dword by 16 bits).
- mov al, 7 ; Round counter: the loop body below executes 7 times. The earlier value in rax is no longer needed.
-@@: ; Top of the round loop.
- paddd xmm0, xmm4 ; First half of the round: row0 += message words in xmm4.
- paddd xmm0, xmm1 ; row0 += row1.
- pxor xmm3, xmm0 ; row3 ^= row0.
- pshufb xmm3, xmm15 ; Byte-shuffle row3 with the ROT16 mask.
- paddd xmm2, xmm3 ; row2 += row3.
- pxor xmm1, xmm2 ; row1 ^= row2.
- movdqa xmm11, xmm1 ; SSE4.1 has no dword rotate: build it from two shifts and an OR.
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11 ; row1 = each dword rotated right by 12.
- paddd xmm0, xmm5 ; row0 += message words in xmm5.
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14 ; Byte-shuffle row3 with the ROT8 mask.
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11 ; row1 = each dword rotated right by 7.
- pshufd xmm0, xmm0, 93H ; Rotate the lanes of rows 0, 3 and 2 by different amounts (row1 stays put) so the second half mixes a different grouping of words.
- pshufd xmm3, xmm3, 4EH ; 4EH swaps the two 64-bit halves.
- pshufd xmm2, xmm2, 39H ; 39H rotates one lane in the opposite direction to 93H.
- paddd xmm0, xmm6 ; Second half of the round: same add/xor/rotate sequence, using message words in xmm6 and xmm7.
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11 ; Rotate right by 12.
- paddd xmm0, xmm7
- paddd xmm0, xmm1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11 ; Rotate right by 7.
- pshufd xmm0, xmm0, 39H ; Undo the lane rotation applied before the second half (39H reverses 93H, 4EH is its own inverse).
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 93H
- dec al ; One round finished.
- jz @F ; After the 7th round, skip the message shuffle and go to finalization.
- movdqa xmm8, xmm4 ; Rearrange the message words held in xmm4..xmm7 for the next round (presumably the BLAKE3 message permutation - confirm against the reference).
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0FH
- pshufd xmm4, xmm8, 39H ; New xmm4 is ready.
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0CCH ; xmm9 accumulates the new xmm5.
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0C0H
- pshufd xmm8, xmm8, 78H ; xmm8 holds the new xmm6.
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH ; New xmm7 is ready; the old xmm5/xmm6 values are dead from here.
- movdqa xmm5, xmm9 ; Commit the new xmm5 ...
- movdqa xmm6, xmm8 ; ... and the new xmm6.
- jmp @B ; Next round.
-@@: ; Finalization, reached after the 7th round.
- pxor xmm0, xmm2 ; Output low half = row0 ^ row2.
- pxor xmm1, xmm3 ; Output high half = row1 ^ row3.
- movups xmmword ptr [rcx], xmm0 ; Overwrite the 32 bytes at rcx with the result (the "in place" part of the name).
- movups xmmword ptr [rcx+10H], xmm1
- movdqa xmm6, xmmword ptr [rsp] ; Restore the saved XMM registers, mirroring the prologue.
- movdqa xmm7, xmmword ptr [rsp+10H]
- movdqa xmm8, xmmword ptr [rsp+20H]
- movdqa xmm9, xmmword ptr [rsp+30H]
- movdqa xmm11, xmmword ptr [rsp+40H]
- movdqa xmm14, xmmword ptr [rsp+50H]
- movdqa xmm15, xmmword ptr [rsp+60H]
- add rsp, 120 ; Release the frame reserved on entry.
- ret
-_blake3_compress_in_place_sse41 ENDP ; Close the alias PROC (the inner one) first ...
-blake3_compress_in_place_sse41 ENDP ; ... then the outer PROC.
-
-ALIGN 16
-blake3_compress_xof_sse41 PROC ; SSE4.1 compression of one 64-byte block at rdx; writes 64 output bytes through a stack-passed pointer and leaves the 32 bytes at rcx unmodified
-_blake3_compress_xof_sse41 PROC ; underscore-prefixed name for the same entry point
- sub rsp, 120 ; 7 x 16-byte save slots (112) + 8; the extra 8 presumably re-aligns rsp to 16 for movdqa - confirm
- movdqa xmmword ptr [rsp], xmm6 ; save every register from xmm6 upward that this routine overwrites
- movdqa xmmword ptr [rsp+10H], xmm7
- movdqa xmmword ptr [rsp+20H], xmm8
- movdqa xmmword ptr [rsp+30H], xmm9
- movdqa xmmword ptr [rsp+40H], xmm11
- movdqa xmmword ptr [rsp+50H], xmm14
- movdqa xmmword ptr [rsp+60H], xmm15
- movups xmm0, xmmword ptr [rcx] ; row 0 = bytes 0..15 at rcx (unaligned load)
- movups xmm1, xmmword ptr [rcx+10H] ; row 1 = bytes 16..31 at rcx
- movaps xmm2, xmmword ptr [BLAKE3_IV] ; row 2 = the four BLAKE3_IV constants
- movzx eax, byte ptr [rsp+0A0H] ; stack byte 40 bytes above the entry rsp (0A0H - 120); presumably the 5th argument, flags - confirm against the C prototype
- movzx r8d, r8b ; keep only the low byte of r8 (presumably block_len - confirm)
- mov r10, qword ptr [rsp+0A8H] ; next stack slot: pointer that receives the 64 output bytes (stored to at the end)
- shl rax, 32
- add r8, rax ; r8 = (stack byte << 32) | low byte of r8
- movq xmm3, r9
- movq xmm4, r8
- punpcklqdq xmm3, xmm4 ; row 3 = { r9 in the low qword, combined r8 in the high qword }
- movups xmm4, xmmword ptr [rdx] ; block bytes 0..15
- movups xmm5, xmmword ptr [rdx+10H] ; block bytes 16..31
- movaps xmm8, xmm4
- shufps xmm4, xmm5, 136 ; imm 10001000b: xmm4 = block dwords 0, 2, 4, 6
- shufps xmm8, xmm5, 221 ; imm 11011101b: xmm8 = block dwords 1, 3, 5, 7
- movaps xmm5, xmm8
- movups xmm6, xmmword ptr [rdx+20H] ; block bytes 32..47
- movups xmm7, xmmword ptr [rdx+30H] ; block bytes 48..63
- movaps xmm8, xmm6
- shufps xmm6, xmm7, 136 ; block dwords 8, 10, 12, 14 ...
- pshufd xmm6, xmm6, 93H ; ... then rotated by one lane
- shufps xmm8, xmm7, 221 ; block dwords 9, 11, 13, 15 ...
- pshufd xmm7, xmm8, 93H ; ... then rotated by one lane
- movaps xmm14, xmmword ptr [ROT8] ; pshufb mask: rotate each dword right by 8 bits
- movaps xmm15, xmmword ptr [ROT16] ; pshufb mask: rotate each dword right by 16 bits
- mov al, 7 ; round counter: the loop body runs 7 times
-@@: ; top of the round loop
- paddd xmm0, xmm4
- paddd xmm0, xmm1 ; row0 += xmm4 (message words) + row1
- pxor xmm3, xmm0
- pshufb xmm3, xmm15 ; row3 = (row3 ^ row0) rotated right by 16
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1 ; xmm11 is scratch for the shift-based rotates
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11 ; row1 = (row1 ^ row2) rotated right by 12
- paddd xmm0, xmm5
- paddd xmm0, xmm1 ; row0 += xmm5 (message words) + row1
- pxor xmm3, xmm0
- pshufb xmm3, xmm14 ; row3 = (row3 ^ row0) rotated right by 8
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11 ; row1 = (row1 ^ row2) rotated right by 7
- pshufd xmm0, xmm0, 93H ; lane-rotate rows 0, 3 and 2 (93H and 39H are opposite one-lane rotations, 4EH swaps halves) ...
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 39H ; ... so the second half of the round mixes a different lane grouping (presumably the diagonals - confirm against the BLAKE3 spec)
- paddd xmm0, xmm6
- paddd xmm0, xmm1 ; same mixing sequence again, now with message words from xmm6
- pxor xmm3, xmm0
- pshufb xmm3, xmm15
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 20
- psrld xmm11, 12
- por xmm1, xmm11
- paddd xmm0, xmm7
- paddd xmm0, xmm1 ; ... and with message words from xmm7
- pxor xmm3, xmm0
- pshufb xmm3, xmm14
- paddd xmm2, xmm3
- pxor xmm1, xmm2
- movdqa xmm11, xmm1
- pslld xmm1, 25
- psrld xmm11, 7
- por xmm1, xmm11
- pshufd xmm0, xmm0, 39H ; inverse lane rotations: rows 0, 3 and 2 return to their original lane order
- pshufd xmm3, xmm3, 4EH
- pshufd xmm2, xmm2, 93H
- dec al
- jz @F ; leave after the 7th round, skipping the message shuffle below
- movdqa xmm8, xmm4 ; from here to the jmp: rebuild xmm4..xmm7 for the next round using register shuffles only (no reload from rdx)
- shufps xmm8, xmm5, 214
- pshufd xmm9, xmm4, 0FH
- pshufd xmm4, xmm8, 39H ; new xmm4
- movdqa xmm8, xmm6
- shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0CCH ; xmm9 accumulates the new xmm5
- movdqa xmm8, xmm7
- punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0C0H
- pshufd xmm8, xmm8, 78H ; xmm8 holds the new xmm6
- punpckhdq xmm5, xmm7
- punpckldq xmm6, xmm5
- pshufd xmm7, xmm6, 1EH ; new xmm7
- movdqa xmm5, xmm9
- movdqa xmm6, xmm8
- jmp @B
-@@: ; all rounds done
- movdqu xmm4, xmmword ptr [rcx] ; reload the 32 input bytes at rcx; xmm4/xmm5 no longer hold message words that are needed
- movdqu xmm5, xmmword ptr [rcx+10H]
- pxor xmm0, xmm2 ; output bytes 0..15 = row0 ^ row2
- pxor xmm1, xmm3 ; output bytes 16..31 = row1 ^ row3
- pxor xmm2, xmm4 ; output bytes 32..47 = row2 ^ input bytes 0..15
- pxor xmm3, xmm5 ; output bytes 48..63 = row3 ^ input bytes 16..31
- movups xmmword ptr [r10], xmm0 ; store all 64 bytes through the stack-passed pointer (unaligned stores)
- movups xmmword ptr [r10+10H], xmm1
- movups xmmword ptr [r10+20H], xmm2
- movups xmmword ptr [r10+30H], xmm3
- movdqa xmm6, xmmword ptr [rsp] ; restore the saved xmm registers
- movdqa xmm7, xmmword ptr [rsp+10H]
- movdqa xmm8, xmmword ptr [rsp+20H]
- movdqa xmm9, xmmword ptr [rsp+30H]
- movdqa xmm11, xmmword ptr [rsp+40H]
- movdqa xmm14, xmmword ptr [rsp+50H]
- movdqa xmm15, xmmword ptr [rsp+60H]
- add rsp, 120 ; release the save area
- ret
-_blake3_compress_xof_sse41 ENDP
-blake3_compress_xof_sse41 ENDP
-
-_TEXT ENDS
-
-
-_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ; read-only constant tables for the SSE4.1 routines
-ALIGN 64 ; every table up to and including ROT8 is exactly 16 bytes, so each one stays 16-byte aligned for the movaps loads
-BLAKE3_IV:
- dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH ; the four IV words, loaded as one vector into xmm2 by the compress routines
-
-ADD0: ; not referenced by the two compress routines; presumably used by blake3_hash_many_sse41 - confirm
- dd 0, 1, 2, 3
-
-ADD1: ; the value 4 in every lane; not referenced by the two compress routines
- dd 4 dup (4)
-
-BLAKE3_IV_0: ; first IV word repeated in all four lanes
- dd 4 dup (6A09E667H)
-
-BLAKE3_IV_1: ; second IV word repeated in all four lanes
- dd 4 dup (0BB67AE85H)
-
-BLAKE3_IV_2: ; third IV word repeated in all four lanes
- dd 4 dup (3C6EF372H)
-
-BLAKE3_IV_3: ; fourth IV word repeated in all four lanes
- dd 4 dup (0A54FF53AH)
-
-BLAKE3_BLOCK_LEN: ; the value 64 in every lane
- dd 4 dup (64)
-
-ROT16: ; pshufb mask: output byte i takes the listed source byte, rotating each dword right by 16 bits
- db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-
-ROT8: ; pshufb mask: rotates each dword right by 8 bits
- db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
-
-CMP_MSB_MASK: ; eight dwords (32 bytes) with only the top bit set; not referenced by the two compress routines
- dd 8 dup(80000000H)
-
-_RDATA ENDS
-END
-
diff --git a/thirdparty/BLAKE3/c/example.c b/thirdparty/BLAKE3/c/example.c
deleted file mode 100644
index 02fe3c32b..000000000
--- a/thirdparty/BLAKE3/c/example.c
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "blake3.h"
-#include <stdio.h>
-#include <unistd.h>
-
-int main() { // Hashes everything read from stdin and prints the 32-byte BLAKE3 digest as lowercase hex; always returns 0.
- // Initialize the hasher.
- blake3_hasher hasher;
- blake3_hasher_init(&hasher);
-
- // Read input bytes from stdin.
- unsigned char buf[65536]; // 64 KiB read buffer on the stack
- ssize_t n;
- while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { // NOTE(review): a read() error (n < 0) ends the loop exactly like EOF, so a digest of the partial input is printed with exit status 0
- blake3_hasher_update(&hasher, buf, n); // feed only the n bytes actually read
- }
-
- // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
- uint8_t output[BLAKE3_OUT_LEN];
- blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
-
- // Print the hash as hexadecimal.
- for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
- printf("%02x", output[i]); // two lowercase hex digits per byte
- }
- printf("\n");
- return 0;
-}
diff --git a/thirdparty/BLAKE3/c/main.c b/thirdparty/BLAKE3/c/main.c
deleted file mode 100644
index 9b8a436f3..000000000
--- a/thirdparty/BLAKE3/c/main.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * This main file is intended for testing via `make test`. It does not build in
- * other settings. See README.md in this directory for examples of how to build
- * C code.
- */
-
-#include <assert.h>
-#include <errno.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "blake3.h"
-#include "blake3_impl.h"
-
-#define HASH_MODE 0
-#define KEYED_HASH_MODE 1
-#define DERIVE_KEY_MODE 2
-
-static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) { /* Decode one hex digit: *valid reports success, *value receives 0..15. */
- if ('0' <= c && c <= '9') {
- *value = c - '0';
- *valid = true;
- } else if ('a' <= c && c <= 'f') { /* lowercase only: 'A'..'F' take the invalid branch */
- *value = 10 + c - 'a';
- *valid = true;
- } else {
- *valid = false; /* *value is left unwritten for an invalid character */
- }
-}
-
-static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) { /* Parse a 64-digit lowercase hex string into out; returns 0 on success, 1 after printing an error to stderr. */
- size_t hex_len = strlen(hex_key);
- if (hex_len != 64) { /* 64 digits decode to 32 bytes; assumes BLAKE3_KEY_LEN is 32 - confirm in blake3.h */
- fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n",
- hex_len);
- return 1;
- }
- for (size_t i = 0; i < 64; i++) {
- uint8_t value;
- bool valid;
- hex_char_value(hex_key[i], &value, &valid);
- if (!valid) {
- fprintf(stderr, "Invalid hex char.\n");
- return 1; /* out may already be partially written at this point */
- }
- if (i % 2 == 0) { /* even index: first digit of a byte, so it becomes the high nibble */
- out[i / 2] = 0;
- value <<= 4;
- }
- out[i / 2] += value; /* odd index adds the low nibble to the same byte */
- }
- return 0;
-}
-
-/* Local copy of the CPU-feature bit flags; presumably mirrors a definition elsewhere in the library (hence "repetition") - confirm. */
-enum cpu_feature {
- SSE2 = 1 << 0,
- SSSE3 = 1 << 1,
- SSE41 = 1 << 2,
- AVX = 1 << 3,
- AVX2 = 1 << 4,
- AVX512F = 1 << 5,
- AVX512VL = 1 << 6,
- /* ... */
- UNDEFINED = 1 << 30
-};
-
-extern enum cpu_feature g_cpu_features; /* defined in another translation unit; main() overwrites it to select the feature set under test */
-enum cpu_feature get_cpu_features(); /* defined in another translation unit; main() uses its result as the mask of feature bits to iterate over */
-
-int main(int argc, char **argv) {
- size_t out_len = BLAKE3_OUT_LEN;
- uint8_t key[BLAKE3_KEY_LEN];
- char *context = "";
- uint8_t mode = HASH_MODE;
- while (argc > 1) {
- if (argc <= 2) {
- fprintf(stderr, "Odd number of arguments.\n");
- return 1;
- }
- if (strcmp("--length", argv[1]) == 0) {
- char *endptr = NULL;
- errno = 0;
- unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10);
- if (errno != 0 || out_len > SIZE_MAX || endptr == argv[2] ||
- *endptr != 0) {
- fprintf(stderr, "Bad length argument.\n");
- return 1;
- }
- out_len = (size_t)out_len_ll;
- } else if (strcmp("--keyed", argv[1]) == 0) {
- mode = KEYED_HASH_MODE;
- int ret = parse_key(argv[2], key);
- if (ret != 0) {
- return ret;
- }
- } else if (strcmp("--derive-key", argv[1]) == 0) {
- mode = DERIVE_KEY_MODE;
- context = argv[2];
- } else {
- fprintf(stderr, "Unknown flag.\n");
- return 1;
- }
- argc -= 2;
- argv += 2;
- }
-
- /*
- * We're going to hash the input multiple times, so we need to buffer it all.
- * This is just for test cases, so go ahead and assume that the input is less
- * than 1 MiB.
- */
- size_t buf_capacity = 1 << 20;
- uint8_t *buf = malloc(buf_capacity);
- assert(buf != NULL);
- size_t buf_len = 0;
- while (1) {
- size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin);
- if (n == 0) {
- break;
- }
- buf_len += n;
- assert(buf_len < buf_capacity);
- }
-
- const int mask = get_cpu_features();
- int feature = 0;
- do {
- fprintf(stderr, "Testing 0x%08X\n", feature);
- g_cpu_features = feature;
- blake3_hasher hasher;
- switch (mode) {
- case HASH_MODE:
- blake3_hasher_init(&hasher);
- break;
- case KEYED_HASH_MODE:
- blake3_hasher_init_keyed(&hasher, key);
- break;
- case DERIVE_KEY_MODE:
- blake3_hasher_init_derive_key(&hasher, context);
- break;
- default:
- abort();
- }
-
- blake3_hasher_update(&hasher, buf, buf_len);
-
- /* TODO: An incremental output reader API to avoid this allocation. */
- uint8_t *out = malloc(out_len);
- if (out_len > 0 && out == NULL) {
- fprintf(stderr, "malloc() failed.\n");
- return 1;
- }
- blake3_hasher_finalize(&hasher, out, out_len);
- for (size_t i = 0; i < out_len; i++) {
- printf("%02x", out[i]);
- }
- printf("\n");
- free(out);
- feature = (feature - mask) & mask;
- } while (feature != 0);
- free(buf);
- return 0;
-}
diff --git a/thirdparty/BLAKE3/c/test.py b/thirdparty/BLAKE3/c/test.py
deleted file mode 100644
index b0b192950..000000000
--- a/thirdparty/BLAKE3/c/test.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#! /usr/bin/env python3
-
-from binascii import hexlify
-import json
-from os import path
-import subprocess
-
-HERE = path.dirname(__file__) # directory containing this script
-TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json") # vectors file in the sibling test_vectors directory
-TEST_VECTORS = json.load(open(TEST_VECTORS_PATH)) # parsed once when the module is loaded
-
-
-def run_blake3(args, input): # Run the "blake3" binary located next to this script with `args`, feeding `input` on stdin; returns its stripped, decoded stdout.
- output = subprocess.run([path.join(HERE, "blake3")] + args,
- input=input,
- stdout=subprocess.PIPE, # capture stdout only; stderr is not redirected
- check=True) # raises CalledProcessError on a non-zero exit status
- return output.stdout.decode().strip()
-
-
-# Fill the input with a repeating byte pattern. We use a cycle length of 251,
-# because that's the largest prime number less than 256. This makes it unlikely
-# that swapping any two adjacent input blocks or chunks will give the same
-# answer.
-def make_test_input(length): # Returns a bytearray of `length` bytes counting 0, 1, ..., 250 and then repeating.
- i = 0
- buf = bytearray()
- while len(buf) < length: # appends one byte per iteration; a length of 0 or less yields an empty buffer
- buf.append(i)
- i = (i + 1) % 251 # wrap after 250, so byte values 251..255 never appear
- return buf
-
-
-def main(): # For every case in TEST_VECTORS, runs the binary in hash, keyed-hash and derive-key modes (default and extended length) and asserts each output line.
- for case in TEST_VECTORS["cases"]:
- input_len = case["input_len"]
- input = make_test_input(input_len)
- hex_key = hexlify(TEST_VECTORS["key"].encode()) # hex form of the key for the --keyed flag (bytes, not str)
- context_string = TEST_VECTORS["context_string"]
- expected_hash_xof = case["hash"] # extended-length output as a hex string
- expected_hash = expected_hash_xof[:64] # first 64 hex digits = the default 32-byte output
- expected_keyed_hash_xof = case["keyed_hash"]
- expected_keyed_hash = expected_keyed_hash_xof[:64]
- expected_derive_key_xof = case["derive_key"]
- expected_derive_key = expected_derive_key_xof[:64]
-
- # Test the default hash.
- test_hash = run_blake3([], input)
- for line in test_hash.splitlines(): # every output line must match; presumably the binary prints one line per tested CPU-feature set - confirm
- assert expected_hash == line, \
- "hash({}): {} != {}".format(input_len, expected_hash, line)
-
- # Test the extended hash.
- xof_len = len(expected_hash_xof) // 2 # two hex digits per output byte
- test_hash_xof = run_blake3(["--length", str(xof_len)], input)
- for line in test_hash_xof.splitlines():
- assert expected_hash_xof == line, \
- "hash_xof({}): {} != {}".format(
- input_len, expected_hash_xof, line)
-
- # Test the default keyed hash.
- test_keyed_hash = run_blake3(["--keyed", hex_key], input)
- for line in test_keyed_hash.splitlines():
- assert expected_keyed_hash == line, \
- "keyed_hash({}): {} != {}".format(
- input_len, expected_keyed_hash, line)
-
- # Test the extended keyed hash.
- xof_len = len(expected_keyed_hash_xof) // 2
- test_keyed_hash_xof = run_blake3(
- ["--keyed", hex_key, "--length",
- str(xof_len)], input)
- for line in test_keyed_hash_xof.splitlines():
- assert expected_keyed_hash_xof == line, \
- "keyed_hash_xof({}): {} != {}".format(
- input_len, expected_keyed_hash_xof, line)
-
- # Test the default derive key.
- test_derive_key = run_blake3(["--derive-key", context_string], input)
- for line in test_derive_key.splitlines():
- assert expected_derive_key == line, \
- "derive_key({}): {} != {}".format(
- input_len, expected_derive_key, line)
-
- # Test the extended derive key.
- xof_len = len(expected_derive_key_xof) // 2
- test_derive_key_xof = run_blake3(
- ["--derive-key", context_string, "--length",
- str(xof_len)], input)
- for line in test_derive_key_xof.splitlines():
- assert expected_derive_key_xof == line, \
- "derive_key_xof({}): {} != {}".format(
- input_len, expected_derive_key_xof, line)
-
-
-if __name__ == "__main__":
- main()
diff --git a/thirdparty/BLAKE3/lib/Linux_x64/libblake3.a b/thirdparty/BLAKE3/lib/Linux_x64/libblake3.a
deleted file mode 100644
index b956e22cb..000000000
--- a/thirdparty/BLAKE3/lib/Linux_x64/libblake3.a
+++ /dev/null
Binary files differ
diff --git a/thirdparty/BLAKE3/lib/Mac_arm64/libblake3.a b/thirdparty/BLAKE3/lib/Mac_arm64/libblake3.a
deleted file mode 100644
index a86e4001e..000000000
--- a/thirdparty/BLAKE3/lib/Mac_arm64/libblake3.a
+++ /dev/null
Binary files differ
diff --git a/thirdparty/BLAKE3/lib/Mac_x64/libblake3.a b/thirdparty/BLAKE3/lib/Mac_x64/libblake3.a
deleted file mode 100644
index c2ed0276a..000000000
--- a/thirdparty/BLAKE3/lib/Mac_x64/libblake3.a
+++ /dev/null
Binary files differ
diff --git a/thirdparty/BLAKE3/lib/Win64/BLAKE3.lib b/thirdparty/BLAKE3/lib/Win64/BLAKE3.lib
deleted file mode 100644
index 1308d9928..000000000
--- a/thirdparty/BLAKE3/lib/Win64/BLAKE3.lib
+++ /dev/null
Binary files differ
diff --git a/thirdparty/BLAKE3/media/B3.svg b/thirdparty/BLAKE3/media/B3.svg
deleted file mode 100644
index a50da0ce9..000000000
--- a/thirdparty/BLAKE3/media/B3.svg
+++ /dev/null
@@ -1,70 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-
-<svg
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- xmlns:cc="http://creativecommons.org/ns#"
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- xmlns:svg="http://www.w3.org/2000/svg"
- xmlns="http://www.w3.org/2000/svg"
- xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
- xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
- width="13.356165mm"
- height="7.1437497mm"
- viewBox="0 0 13.356165 7.1437497"
- version="1.1"
- id="svg8"
- sodipodi:docname="B3.svg"
- inkscape:version="0.92.4 5da689c313, 2019-01-14">
- <defs
- id="defs2" />
- <sodipodi:namedview
- id="base"
- pagecolor="#ffffff"
- bordercolor="#666666"
- borderopacity="1.0"
- inkscape:pageopacity="0.0"
- inkscape:pageshadow="2"
- inkscape:zoom="4"
- inkscape:cx="72.73328"
- inkscape:cy="-34.835127"
- inkscape:document-units="mm"
- inkscape:current-layer="layer1"
- showgrid="false"
- inkscape:window-width="1920"
- inkscape:window-height="1016"
- inkscape:window-x="0"
- inkscape:window-y="27"
- inkscape:window-maximized="1" />
- <metadata
- id="metadata5">
- <rdf:RDF>
- <cc:Work
- rdf:about="">
- <dc:format>image/svg+xml</dc:format>
- <dc:type
- rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:title />
- </cc:Work>
- </rdf:RDF>
- </metadata>
- <g
- inkscape:label="Layer 1"
- inkscape:groupmode="layer"
- id="layer1"
- transform="translate(-24.441005,-113.52518)">
- <g
- aria-label="B3"
- style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
- id="text868">
- <path
- d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path814" />
- <path
- d="m 35.38417,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 -0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814917,-0.23284 0.264583,-0.24341 0.264583,-0.67733 0,-0.85725 -1.090083,-0.85725 h -2.201334 v -1.13242 h 2.169584 q 0.550333,0 0.814916,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 -0.8255,-0.24342 h -3.566583 v -1.24883 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path816" />
- </g>
- </g>
-</svg>
diff --git a/thirdparty/BLAKE3/media/BLAKE3.svg b/thirdparty/BLAKE3/media/BLAKE3.svg
deleted file mode 100644
index 2d50c2d3b..000000000
--- a/thirdparty/BLAKE3/media/BLAKE3.svg
+++ /dev/null
@@ -1,85 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-
-<svg
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- xmlns:cc="http://creativecommons.org/ns#"
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- xmlns:svg="http://www.w3.org/2000/svg"
- xmlns="http://www.w3.org/2000/svg"
- xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
- xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
- width="43.063534mm"
- height="7.2707496mm"
- viewBox="0 0 43.063534 7.2707496"
- version="1.1"
- id="svg8"
- sodipodi:docname="BLAKE3.svg"
- inkscape:version="0.92.4 5da689c313, 2019-01-14">
- <defs
- id="defs2" />
- <sodipodi:namedview
- id="base"
- pagecolor="#ffffff"
- bordercolor="#666666"
- borderopacity="1.0"
- inkscape:pageopacity="0.0"
- inkscape:pageshadow="2"
- inkscape:zoom="4"
- inkscape:cx="72.73328"
- inkscape:cy="-34.835127"
- inkscape:document-units="mm"
- inkscape:current-layer="layer1"
- showgrid="false"
- inkscape:window-width="1920"
- inkscape:window-height="1016"
- inkscape:window-x="0"
- inkscape:window-y="27"
- inkscape:window-maximized="1" />
- <metadata
- id="metadata5">
- <rdf:RDF>
- <cc:Work
- rdf:about="">
- <dc:format>image/svg+xml</dc:format>
- <dc:type
- rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- </cc:Work>
- </rdf:RDF>
- </metadata>
- <g
- inkscape:label="Layer 1"
- inkscape:groupmode="layer"
- id="layer1"
- transform="translate(-24.441005,-113.39818)">
- <g
- aria-label="BLAKE3"
- style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
- id="text868">
- <path
- d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path814" />
- <path
- d="m 33.22517,113.52518 v 4.66725 q 0,0.254 0.0635,0.48683 0.07408,0.22225 0.243417,0.39159 0.169333,0.15875 0.4445,0.254 0.28575,0.0953 0.709083,0.0953 h 2.772833 v 1.24883 h -2.846916 q -0.709084,0 -1.217084,-0.17992 -0.497416,-0.1905 -0.814916,-0.51858 -0.3175,-0.32808 -0.465667,-0.77258 -0.137583,-0.45509 -0.137583,-0.99484 v -4.67783 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path816" />
- <path
- d="M 39.342334,120.66893 H 37.9665 l 2.50825,-6.35 q 0.201084,-0.508 0.560917,-0.70908 0.370417,-0.21167 0.941917,-0.21167 0.560916,0 0.92075,0.21167 0.370416,0.20108 0.560916,0.70908 l 2.413,6.35 h -1.386416 l -2.169584,-5.74675 q -0.09525,-0.24342 -0.34925,-0.24342 -0.254,0 -0.359833,0.24342 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path818" />
- <path
- d="m 48.179401,113.52518 v 3.02683 h 0.687917 q 0.455083,0 0.740833,-0.0212 0.296333,-0.0318 0.486833,-0.127 0.1905,-0.0953 0.3175,-0.26459 0.137584,-0.17991 0.28575,-0.47625 l 1.090084,-2.13783 h 1.344083 l -1.121833,2.2225 q -0.243417,0.47625 -0.518584,0.79375 -0.275166,0.3175 -0.719666,0.508 0.254,0.0635 0.4445,0.17992 0.1905,0.10583 0.34925,0.27516 0.169333,0.15875 0.3175,0.39159 0.148166,0.22225 0.306916,0.52916 l 1.153584,2.24367 h -1.397 l -1.090084,-2.11667 q -0.148166,-0.28575 -0.28575,-0.45508 -0.137583,-0.16933 -0.34925,-0.26458 -0.211666,-0.0952 -0.529166,-0.11642 -0.3175,-0.0317 -0.8255,-0.0317 h -0.687917 v 2.9845 h -1.248833 v -7.14375 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path820" />
- <path
- d="m 60.127965,113.52518 v 1.24883 h -3.577166 q -0.5715,0 -0.8255,0.24342 -0.254,0.24342 -0.254,0.65617 0,0.84666 1.090083,0.84666 h 3.513667 v 1.13242 h -3.545417 q -1.090083,0 -1.090083,0.86783 0,0.42334 0.264583,0.66675 0.264583,0.23284 0.814917,0.23284 h 3.6195 v 1.24883 h -3.6195 q -0.635,0 -1.090083,-0.14817 -0.4445,-0.14816 -0.740834,-0.40216 -0.28575,-0.26459 -0.423333,-0.62442 -0.127,-0.35983 -0.127,-0.77258 0,-0.61384 0.264583,-1.016 0.264584,-0.41275 0.762,-0.62442 -1.005416,-0.41275 -1.005416,-1.60867 0,-0.42333 0.137583,-0.78316 0.137583,-0.35984 0.423333,-0.61384 0.296334,-0.26458 0.740834,-0.40216 0.455083,-0.14817 1.090083,-0.14817 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path822" />
- <path
- d="m 65.091539,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 -0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814916,-0.23284 0.264584,-0.24341 0.264584,-0.67733 0,-0.85725 -1.090084,-0.85725 h -2.201333 v -1.13242 h 2.169583 q 0.550334,0 0.814917,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 -0.8255,-0.24342 h -3.566583 v -1.24883 z"
- style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
- id="path824" />
- </g>
- </g>
-</svg>
diff --git a/thirdparty/BLAKE3/media/speed.svg b/thirdparty/BLAKE3/media/speed.svg
deleted file mode 100644
index 7bd65ca3c..000000000
--- a/thirdparty/BLAKE3/media/speed.svg
+++ /dev/null
@@ -1,1474 +0,0 @@
-<?xml version="1.0" encoding="utf-8" standalone="no"?>
-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
- "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<!-- Created with matplotlib (https://matplotlib.org/) -->
-<svg height="331.389812pt" version="1.1" viewBox="0 0 449.761406 331.389812" width="449.761406pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
- <defs>
- <style type="text/css">
-*{stroke-linecap:butt;stroke-linejoin:round;}
- </style>
- </defs>
- <g id="figure_1">
- <g id="patch_1">
- <path d="M 0 331.389812
-L 449.761406 331.389812
-L 449.761406 0
-L 0 0
-z
-" style="fill:#ffffff;"/>
- </g>
- <g id="axes_1">
- <g id="patch_2">
- <path d="M 71.443906 288.430125
-L 428.563906 288.430125
-L 428.563906 22.318125
-L 71.443906 22.318125
-z
-" style="fill:#ffffff;"/>
- </g>
- <g id="matplotlib.axis_1">
- <g id="xtick_1">
- <g id="line2d_1">
- <defs>
- <path d="M 0 0
-L 0 6
-" id="me95d5351a6" style="stroke:#262626;stroke-width:1.25;"/>
- </defs>
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_1">
- <!-- 0 -->
- <defs>
- <path d="M 31.78125 66.40625
-Q 24.171875 66.40625 20.328125 58.90625
-Q 16.5 51.421875 16.5 36.375
-Q 16.5 21.390625 20.328125 13.890625
-Q 24.171875 6.390625 31.78125 6.390625
-Q 39.453125 6.390625 43.28125 13.890625
-Q 47.125 21.390625 47.125 36.375
-Q 47.125 51.421875 43.28125 58.90625
-Q 39.453125 66.40625 31.78125 66.40625
-z
-M 31.78125 74.21875
-Q 44.046875 74.21875 50.515625 64.515625
-Q 56.984375 54.828125 56.984375 36.375
-Q 56.984375 17.96875 50.515625 8.265625
-Q 44.046875 -1.421875 31.78125 -1.421875
-Q 19.53125 -1.421875 13.0625 8.265625
-Q 6.59375 17.96875 6.59375 36.375
-Q 6.59375 54.828125 13.0625 64.515625
-Q 19.53125 74.21875 31.78125 74.21875
-z
-" id="DejaVuSans-48"/>
- </defs>
- <g style="fill:#262626;" transform="translate(67.944531 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_2">
- <g id="line2d_2">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="116.083906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_2">
- <!-- 1000 -->
- <defs>
- <path d="M 12.40625 8.296875
-L 28.515625 8.296875
-L 28.515625 63.921875
-L 10.984375 60.40625
-L 10.984375 69.390625
-L 28.421875 72.90625
-L 38.28125 72.90625
-L 38.28125 8.296875
-L 54.390625 8.296875
-L 54.390625 0
-L 12.40625 0
-z
-" id="DejaVuSans-49"/>
- </defs>
- <g style="fill:#262626;" transform="translate(102.086406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-49"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_3">
- <g id="line2d_3">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="160.723906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_3">
- <!-- 2000 -->
- <defs>
- <path d="M 19.1875 8.296875
-L 53.609375 8.296875
-L 53.609375 0
-L 7.328125 0
-L 7.328125 8.296875
-Q 12.9375 14.109375 22.625 23.890625
-Q 32.328125 33.6875 34.8125 36.53125
-Q 39.546875 41.84375 41.421875 45.53125
-Q 43.3125 49.21875 43.3125 52.78125
-Q 43.3125 58.59375 39.234375 62.25
-Q 35.15625 65.921875 28.609375 65.921875
-Q 23.96875 65.921875 18.8125 64.3125
-Q 13.671875 62.703125 7.8125 59.421875
-L 7.8125 69.390625
-Q 13.765625 71.78125 18.9375 73
-Q 24.125 74.21875 28.421875 74.21875
-Q 39.75 74.21875 46.484375 68.546875
-Q 53.21875 62.890625 53.21875 53.421875
-Q 53.21875 48.921875 51.53125 44.890625
-Q 49.859375 40.875 45.40625 35.40625
-Q 44.1875 33.984375 37.640625 27.21875
-Q 31.109375 20.453125 19.1875 8.296875
-z
-" id="DejaVuSans-50"/>
- </defs>
- <g style="fill:#262626;" transform="translate(146.726406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-50"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_4">
- <g id="line2d_4">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="205.363906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_4">
- <!-- 3000 -->
- <defs>
- <path d="M 40.578125 39.3125
-Q 47.65625 37.796875 51.625 33
-Q 55.609375 28.21875 55.609375 21.1875
-Q 55.609375 10.40625 48.1875 4.484375
-Q 40.765625 -1.421875 27.09375 -1.421875
-Q 22.515625 -1.421875 17.65625 -0.515625
-Q 12.796875 0.390625 7.625 2.203125
-L 7.625 11.71875
-Q 11.71875 9.328125 16.59375 8.109375
-Q 21.484375 6.890625 26.8125 6.890625
-Q 36.078125 6.890625 40.9375 10.546875
-Q 45.796875 14.203125 45.796875 21.1875
-Q 45.796875 27.640625 41.28125 31.265625
-Q 36.765625 34.90625 28.71875 34.90625
-L 20.21875 34.90625
-L 20.21875 43.015625
-L 29.109375 43.015625
-Q 36.375 43.015625 40.234375 45.921875
-Q 44.09375 48.828125 44.09375 54.296875
-Q 44.09375 59.90625 40.109375 62.90625
-Q 36.140625 65.921875 28.71875 65.921875
-Q 24.65625 65.921875 20.015625 65.03125
-Q 15.375 64.15625 9.8125 62.3125
-L 9.8125 71.09375
-Q 15.4375 72.65625 20.34375 73.4375
-Q 25.25 74.21875 29.59375 74.21875
-Q 40.828125 74.21875 47.359375 69.109375
-Q 53.90625 64.015625 53.90625 55.328125
-Q 53.90625 49.265625 50.4375 45.09375
-Q 46.96875 40.921875 40.578125 39.3125
-z
-" id="DejaVuSans-51"/>
- </defs>
- <g style="fill:#262626;" transform="translate(191.366406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-51"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_5">
- <g id="line2d_5">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="250.003906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_5">
- <!-- 4000 -->
- <defs>
- <path d="M 37.796875 64.3125
-L 12.890625 25.390625
-L 37.796875 25.390625
-z
-M 35.203125 72.90625
-L 47.609375 72.90625
-L 47.609375 25.390625
-L 58.015625 25.390625
-L 58.015625 17.1875
-L 47.609375 17.1875
-L 47.609375 0
-L 37.796875 0
-L 37.796875 17.1875
-L 4.890625 17.1875
-L 4.890625 26.703125
-z
-" id="DejaVuSans-52"/>
- </defs>
- <g style="fill:#262626;" transform="translate(236.006406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-52"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_6">
- <g id="line2d_6">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="294.643906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_6">
- <!-- 5000 -->
- <defs>
- <path d="M 10.796875 72.90625
-L 49.515625 72.90625
-L 49.515625 64.59375
-L 19.828125 64.59375
-L 19.828125 46.734375
-Q 21.96875 47.46875 24.109375 47.828125
-Q 26.265625 48.1875 28.421875 48.1875
-Q 40.625 48.1875 47.75 41.5
-Q 54.890625 34.8125 54.890625 23.390625
-Q 54.890625 11.625 47.5625 5.09375
-Q 40.234375 -1.421875 26.90625 -1.421875
-Q 22.3125 -1.421875 17.546875 -0.640625
-Q 12.796875 0.140625 7.71875 1.703125
-L 7.71875 11.625
-Q 12.109375 9.234375 16.796875 8.0625
-Q 21.484375 6.890625 26.703125 6.890625
-Q 35.15625 6.890625 40.078125 11.328125
-Q 45.015625 15.765625 45.015625 23.390625
-Q 45.015625 31 40.078125 35.4375
-Q 35.15625 39.890625 26.703125 39.890625
-Q 22.75 39.890625 18.8125 39.015625
-Q 14.890625 38.140625 10.796875 36.28125
-z
-" id="DejaVuSans-53"/>
- </defs>
- <g style="fill:#262626;" transform="translate(280.646406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-53"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_7">
- <g id="line2d_7">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="339.283906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_7">
- <!-- 6000 -->
- <defs>
- <path d="M 33.015625 40.375
-Q 26.375 40.375 22.484375 35.828125
-Q 18.609375 31.296875 18.609375 23.390625
-Q 18.609375 15.53125 22.484375 10.953125
-Q 26.375 6.390625 33.015625 6.390625
-Q 39.65625 6.390625 43.53125 10.953125
-Q 47.40625 15.53125 47.40625 23.390625
-Q 47.40625 31.296875 43.53125 35.828125
-Q 39.65625 40.375 33.015625 40.375
-z
-M 52.59375 71.296875
-L 52.59375 62.3125
-Q 48.875 64.0625 45.09375 64.984375
-Q 41.3125 65.921875 37.59375 65.921875
-Q 27.828125 65.921875 22.671875 59.328125
-Q 17.53125 52.734375 16.796875 39.40625
-Q 19.671875 43.65625 24.015625 45.921875
-Q 28.375 48.1875 33.59375 48.1875
-Q 44.578125 48.1875 50.953125 41.515625
-Q 57.328125 34.859375 57.328125 23.390625
-Q 57.328125 12.15625 50.6875 5.359375
-Q 44.046875 -1.421875 33.015625 -1.421875
-Q 20.359375 -1.421875 13.671875 8.265625
-Q 6.984375 17.96875 6.984375 36.375
-Q 6.984375 53.65625 15.1875 63.9375
-Q 23.390625 74.21875 37.203125 74.21875
-Q 40.921875 74.21875 44.703125 73.484375
-Q 48.484375 72.75 52.59375 71.296875
-z
-" id="DejaVuSans-54"/>
- </defs>
- <g style="fill:#262626;" transform="translate(325.286406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-54"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_8">
- <g id="line2d_8">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="383.923906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_8">
- <!-- 7000 -->
- <defs>
- <path d="M 8.203125 72.90625
-L 55.078125 72.90625
-L 55.078125 68.703125
-L 28.609375 0
-L 18.3125 0
-L 43.21875 64.59375
-L 8.203125 64.59375
-z
-" id="DejaVuSans-55"/>
- </defs>
- <g style="fill:#262626;" transform="translate(369.926406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-55"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="xtick_9">
- <g id="line2d_9">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="428.563906" xlink:href="#me95d5351a6" y="288.430125"/>
- </g>
- </g>
- <g id="text_9">
- <!-- 8000 -->
- <defs>
- <path d="M 31.78125 34.625
-Q 24.75 34.625 20.71875 30.859375
-Q 16.703125 27.09375 16.703125 20.515625
-Q 16.703125 13.921875 20.71875 10.15625
-Q 24.75 6.390625 31.78125 6.390625
-Q 38.8125 6.390625 42.859375 10.171875
-Q 46.921875 13.96875 46.921875 20.515625
-Q 46.921875 27.09375 42.890625 30.859375
-Q 38.875 34.625 31.78125 34.625
-z
-M 21.921875 38.8125
-Q 15.578125 40.375 12.03125 44.71875
-Q 8.5 49.078125 8.5 55.328125
-Q 8.5 64.0625 14.71875 69.140625
-Q 20.953125 74.21875 31.78125 74.21875
-Q 42.671875 74.21875 48.875 69.140625
-Q 55.078125 64.0625 55.078125 55.328125
-Q 55.078125 49.078125 51.53125 44.71875
-Q 48 40.375 41.703125 38.8125
-Q 48.828125 37.15625 52.796875 32.3125
-Q 56.78125 27.484375 56.78125 20.515625
-Q 56.78125 9.90625 50.3125 4.234375
-Q 43.84375 -1.421875 31.78125 -1.421875
-Q 19.734375 -1.421875 13.25 4.234375
-Q 6.78125 9.90625 6.78125 20.515625
-Q 6.78125 27.484375 10.78125 32.3125
-Q 14.796875 37.15625 21.921875 38.8125
-z
-M 18.3125 54.390625
-Q 18.3125 48.734375 21.84375 45.5625
-Q 25.390625 42.390625 31.78125 42.390625
-Q 38.140625 42.390625 41.71875 45.5625
-Q 45.3125 48.734375 45.3125 54.390625
-Q 45.3125 60.0625 41.71875 63.234375
-Q 38.140625 66.40625 31.78125 66.40625
-Q 25.390625 66.40625 21.84375 63.234375
-Q 18.3125 60.0625 18.3125 54.390625
-z
-" id="DejaVuSans-56"/>
- </defs>
- <g style="fill:#262626;" transform="translate(414.566406 306.288406)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-56"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- <use x="190.869141" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- </g>
- <g id="text_10">
- <!-- Speed (MiB/s) -->
- <defs>
- <path d="M 53.515625 70.515625
-L 53.515625 60.890625
-Q 47.90625 63.578125 42.921875 64.890625
-Q 37.9375 66.21875 33.296875 66.21875
-Q 25.25 66.21875 20.875 63.09375
-Q 16.5 59.96875 16.5 54.203125
-Q 16.5 49.359375 19.40625 46.890625
-Q 22.3125 44.4375 30.421875 42.921875
-L 36.375 41.703125
-Q 47.40625 39.59375 52.65625 34.296875
-Q 57.90625 29 57.90625 20.125
-Q 57.90625 9.515625 50.796875 4.046875
-Q 43.703125 -1.421875 29.984375 -1.421875
-Q 24.8125 -1.421875 18.96875 -0.25
-Q 13.140625 0.921875 6.890625 3.21875
-L 6.890625 13.375
-Q 12.890625 10.015625 18.65625 8.296875
-Q 24.421875 6.59375 29.984375 6.59375
-Q 38.421875 6.59375 43.015625 9.90625
-Q 47.609375 13.234375 47.609375 19.390625
-Q 47.609375 24.75 44.3125 27.78125
-Q 41.015625 30.8125 33.5 32.328125
-L 27.484375 33.5
-Q 16.453125 35.6875 11.515625 40.375
-Q 6.59375 45.0625 6.59375 53.421875
-Q 6.59375 63.09375 13.40625 68.65625
-Q 20.21875 74.21875 32.171875 74.21875
-Q 37.3125 74.21875 42.625 73.28125
-Q 47.953125 72.359375 53.515625 70.515625
-z
-" id="DejaVuSans-83"/>
- <path d="M 18.109375 8.203125
-L 18.109375 -20.796875
-L 9.078125 -20.796875
-L 9.078125 54.6875
-L 18.109375 54.6875
-L 18.109375 46.390625
-Q 20.953125 51.265625 25.265625 53.625
-Q 29.59375 56 35.59375 56
-Q 45.5625 56 51.78125 48.09375
-Q 58.015625 40.1875 58.015625 27.296875
-Q 58.015625 14.40625 51.78125 6.484375
-Q 45.5625 -1.421875 35.59375 -1.421875
-Q 29.59375 -1.421875 25.265625 0.953125
-Q 20.953125 3.328125 18.109375 8.203125
-z
-M 48.6875 27.296875
-Q 48.6875 37.203125 44.609375 42.84375
-Q 40.53125 48.484375 33.40625 48.484375
-Q 26.265625 48.484375 22.1875 42.84375
-Q 18.109375 37.203125 18.109375 27.296875
-Q 18.109375 17.390625 22.1875 11.75
-Q 26.265625 6.109375 33.40625 6.109375
-Q 40.53125 6.109375 44.609375 11.75
-Q 48.6875 17.390625 48.6875 27.296875
-z
-" id="DejaVuSans-112"/>
- <path d="M 56.203125 29.59375
-L 56.203125 25.203125
-L 14.890625 25.203125
-Q 15.484375 15.921875 20.484375 11.0625
-Q 25.484375 6.203125 34.421875 6.203125
-Q 39.59375 6.203125 44.453125 7.46875
-Q 49.3125 8.734375 54.109375 11.28125
-L 54.109375 2.78125
-Q 49.265625 0.734375 44.1875 -0.34375
-Q 39.109375 -1.421875 33.890625 -1.421875
-Q 20.796875 -1.421875 13.15625 6.1875
-Q 5.515625 13.8125 5.515625 26.8125
-Q 5.515625 40.234375 12.765625 48.109375
-Q 20.015625 56 32.328125 56
-Q 43.359375 56 49.78125 48.890625
-Q 56.203125 41.796875 56.203125 29.59375
-z
-M 47.21875 32.234375
-Q 47.125 39.59375 43.09375 43.984375
-Q 39.0625 48.390625 32.421875 48.390625
-Q 24.90625 48.390625 20.390625 44.140625
-Q 15.875 39.890625 15.1875 32.171875
-z
-" id="DejaVuSans-101"/>
- <path d="M 45.40625 46.390625
-L 45.40625 75.984375
-L 54.390625 75.984375
-L 54.390625 0
-L 45.40625 0
-L 45.40625 8.203125
-Q 42.578125 3.328125 38.25 0.953125
-Q 33.9375 -1.421875 27.875 -1.421875
-Q 17.96875 -1.421875 11.734375 6.484375
-Q 5.515625 14.40625 5.515625 27.296875
-Q 5.515625 40.1875 11.734375 48.09375
-Q 17.96875 56 27.875 56
-Q 33.9375 56 38.25 53.625
-Q 42.578125 51.265625 45.40625 46.390625
-z
-M 14.796875 27.296875
-Q 14.796875 17.390625 18.875 11.75
-Q 22.953125 6.109375 30.078125 6.109375
-Q 37.203125 6.109375 41.296875 11.75
-Q 45.40625 17.390625 45.40625 27.296875
-Q 45.40625 37.203125 41.296875 42.84375
-Q 37.203125 48.484375 30.078125 48.484375
-Q 22.953125 48.484375 18.875 42.84375
-Q 14.796875 37.203125 14.796875 27.296875
-z
-" id="DejaVuSans-100"/>
- <path id="DejaVuSans-32"/>
- <path d="M 31 75.875
-Q 24.46875 64.65625 21.28125 53.65625
-Q 18.109375 42.671875 18.109375 31.390625
-Q 18.109375 20.125 21.3125 9.0625
-Q 24.515625 -2 31 -13.1875
-L 23.1875 -13.1875
-Q 15.875 -1.703125 12.234375 9.375
-Q 8.59375 20.453125 8.59375 31.390625
-Q 8.59375 42.28125 12.203125 53.3125
-Q 15.828125 64.359375 23.1875 75.875
-z
-" id="DejaVuSans-40"/>
- <path d="M 9.8125 72.90625
-L 24.515625 72.90625
-L 43.109375 23.296875
-L 61.8125 72.90625
-L 76.515625 72.90625
-L 76.515625 0
-L 66.890625 0
-L 66.890625 64.015625
-L 48.09375 14.015625
-L 38.1875 14.015625
-L 19.390625 64.015625
-L 19.390625 0
-L 9.8125 0
-z
-" id="DejaVuSans-77"/>
- <path d="M 9.421875 54.6875
-L 18.40625 54.6875
-L 18.40625 0
-L 9.421875 0
-z
-M 9.421875 75.984375
-L 18.40625 75.984375
-L 18.40625 64.59375
-L 9.421875 64.59375
-z
-" id="DejaVuSans-105"/>
- <path d="M 19.671875 34.8125
-L 19.671875 8.109375
-L 35.5 8.109375
-Q 43.453125 8.109375 47.28125 11.40625
-Q 51.125 14.703125 51.125 21.484375
-Q 51.125 28.328125 47.28125 31.5625
-Q 43.453125 34.8125 35.5 34.8125
-z
-M 19.671875 64.796875
-L 19.671875 42.828125
-L 34.28125 42.828125
-Q 41.5 42.828125 45.03125 45.53125
-Q 48.578125 48.25 48.578125 53.8125
-Q 48.578125 59.328125 45.03125 62.0625
-Q 41.5 64.796875 34.28125 64.796875
-z
-M 9.8125 72.90625
-L 35.015625 72.90625
-Q 46.296875 72.90625 52.390625 68.21875
-Q 58.5 63.53125 58.5 54.890625
-Q 58.5 48.1875 55.375 44.234375
-Q 52.25 40.28125 46.1875 39.3125
-Q 53.46875 37.75 57.5 32.78125
-Q 61.53125 27.828125 61.53125 20.40625
-Q 61.53125 10.640625 54.890625 5.3125
-Q 48.25 0 35.984375 0
-L 9.8125 0
-z
-" id="DejaVuSans-66"/>
- <path d="M 25.390625 72.90625
-L 33.6875 72.90625
-L 8.296875 -9.28125
-L 0 -9.28125
-z
-" id="DejaVuSans-47"/>
- <path d="M 44.28125 53.078125
-L 44.28125 44.578125
-Q 40.484375 46.53125 36.375 47.5
-Q 32.28125 48.484375 27.875 48.484375
-Q 21.1875 48.484375 17.84375 46.4375
-Q 14.5 44.390625 14.5 40.28125
-Q 14.5 37.15625 16.890625 35.375
-Q 19.28125 33.59375 26.515625 31.984375
-L 29.59375 31.296875
-Q 39.15625 29.25 43.1875 25.515625
-Q 47.21875 21.78125 47.21875 15.09375
-Q 47.21875 7.46875 41.1875 3.015625
-Q 35.15625 -1.421875 24.609375 -1.421875
-Q 20.21875 -1.421875 15.453125 -0.5625
-Q 10.6875 0.296875 5.421875 2
-L 5.421875 11.28125
-Q 10.40625 8.6875 15.234375 7.390625
-Q 20.0625 6.109375 24.8125 6.109375
-Q 31.15625 6.109375 34.5625 8.28125
-Q 37.984375 10.453125 37.984375 14.40625
-Q 37.984375 18.0625 35.515625 20.015625
-Q 33.0625 21.96875 24.703125 23.78125
-L 21.578125 24.515625
-Q 13.234375 26.265625 9.515625 29.90625
-Q 5.8125 33.546875 5.8125 39.890625
-Q 5.8125 47.609375 11.28125 51.796875
-Q 16.75 56 26.8125 56
-Q 31.78125 56 36.171875 55.265625
-Q 40.578125 54.546875 44.28125 53.078125
-z
-" id="DejaVuSans-115"/>
- <path d="M 8.015625 75.875
-L 15.828125 75.875
-Q 23.140625 64.359375 26.78125 53.3125
-Q 30.421875 42.28125 30.421875 31.390625
-Q 30.421875 20.453125 26.78125 9.375
-Q 23.140625 -1.703125 15.828125 -13.1875
-L 8.015625 -13.1875
-Q 14.5 -2 17.703125 9.0625
-Q 20.90625 20.125 20.90625 31.390625
-Q 20.90625 42.671875 17.703125 53.65625
-Q 14.5 64.65625 8.015625 75.875
-z
-" id="DejaVuSans-41"/>
- </defs>
- <g style="fill:#262626;" transform="translate(208.497031 321.694187)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-83"/>
- <use x="63.476562" xlink:href="#DejaVuSans-112"/>
- <use x="126.953125" xlink:href="#DejaVuSans-101"/>
- <use x="188.476562" xlink:href="#DejaVuSans-101"/>
- <use x="250" xlink:href="#DejaVuSans-100"/>
- <use x="313.476562" xlink:href="#DejaVuSans-32"/>
- <use x="345.263672" xlink:href="#DejaVuSans-40"/>
- <use x="384.277344" xlink:href="#DejaVuSans-77"/>
- <use x="470.556641" xlink:href="#DejaVuSans-105"/>
- <use x="498.339844" xlink:href="#DejaVuSans-66"/>
- <use x="566.943359" xlink:href="#DejaVuSans-47"/>
- <use x="600.634766" xlink:href="#DejaVuSans-115"/>
- <use x="652.734375" xlink:href="#DejaVuSans-41"/>
- </g>
- </g>
- </g>
- <g id="matplotlib.axis_2">
- <g id="ytick_1">
- <g id="line2d_10">
- <defs>
- <path d="M 0 0
-L -6 0
-" id="m7d1bb602a9" style="stroke:#262626;stroke-width:1.25;"/>
- </defs>
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="38.950125"/>
- </g>
- </g>
- <g id="text_11">
- <!-- BLAKE3 -->
- <defs>
- <path d="M 9.8125 72.90625
-L 19.671875 72.90625
-L 19.671875 8.296875
-L 55.171875 8.296875
-L 55.171875 0
-L 9.8125 0
-z
-" id="DejaVuSans-76"/>
- <path d="M 34.1875 63.1875
-L 20.796875 26.90625
-L 47.609375 26.90625
-z
-M 28.609375 72.90625
-L 39.796875 72.90625
-L 67.578125 0
-L 57.328125 0
-L 50.6875 18.703125
-L 17.828125 18.703125
-L 11.1875 0
-L 0.78125 0
-z
-" id="DejaVuSans-65"/>
- <path d="M 9.8125 72.90625
-L 19.671875 72.90625
-L 19.671875 42.09375
-L 52.390625 72.90625
-L 65.09375 72.90625
-L 28.90625 38.921875
-L 67.671875 0
-L 54.6875 0
-L 19.671875 35.109375
-L 19.671875 0
-L 9.8125 0
-z
-" id="DejaVuSans-75"/>
- <path d="M 9.8125 72.90625
-L 55.90625 72.90625
-L 55.90625 64.59375
-L 19.671875 64.59375
-L 19.671875 43.015625
-L 54.390625 43.015625
-L 54.390625 34.71875
-L 19.671875 34.71875
-L 19.671875 8.296875
-L 56.78125 8.296875
-L 56.78125 0
-L 9.8125 0
-z
-" id="DejaVuSans-69"/>
- </defs>
- <g style="fill:#262626;" transform="translate(19.576719 43.129266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-66"/>
- <use x="68.603516" xlink:href="#DejaVuSans-76"/>
- <use x="124.347656" xlink:href="#DejaVuSans-65"/>
- <use x="192.755859" xlink:href="#DejaVuSans-75"/>
- <use x="258.332031" xlink:href="#DejaVuSans-69"/>
- <use x="321.515625" xlink:href="#DejaVuSans-51"/>
- </g>
- </g>
- </g>
- <g id="ytick_2">
- <g id="line2d_11">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="72.214125"/>
- </g>
- </g>
- <g id="text_12">
- <!-- BLAKE2b -->
- <defs>
- <path d="M 48.6875 27.296875
-Q 48.6875 37.203125 44.609375 42.84375
-Q 40.53125 48.484375 33.40625 48.484375
-Q 26.265625 48.484375 22.1875 42.84375
-Q 18.109375 37.203125 18.109375 27.296875
-Q 18.109375 17.390625 22.1875 11.75
-Q 26.265625 6.109375 33.40625 6.109375
-Q 40.53125 6.109375 44.609375 11.75
-Q 48.6875 17.390625 48.6875 27.296875
-z
-M 18.109375 46.390625
-Q 20.953125 51.265625 25.265625 53.625
-Q 29.59375 56 35.59375 56
-Q 45.5625 56 51.78125 48.09375
-Q 58.015625 40.1875 58.015625 27.296875
-Q 58.015625 14.40625 51.78125 6.484375
-Q 45.5625 -1.421875 35.59375 -1.421875
-Q 29.59375 -1.421875 25.265625 0.953125
-Q 20.953125 3.328125 18.109375 8.203125
-L 18.109375 0
-L 9.078125 0
-L 9.078125 75.984375
-L 18.109375 75.984375
-z
-" id="DejaVuSans-98"/>
- </defs>
- <g style="fill:#262626;" transform="translate(12.593438 76.393266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-66"/>
- <use x="68.603516" xlink:href="#DejaVuSans-76"/>
- <use x="124.347656" xlink:href="#DejaVuSans-65"/>
- <use x="192.755859" xlink:href="#DejaVuSans-75"/>
- <use x="258.332031" xlink:href="#DejaVuSans-69"/>
- <use x="321.515625" xlink:href="#DejaVuSans-50"/>
- <use x="385.138672" xlink:href="#DejaVuSans-98"/>
- </g>
- </g>
- </g>
- <g id="ytick_3">
- <g id="line2d_12">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="105.478125"/>
- </g>
- </g>
- <g id="text_13">
- <!-- SHA-1 -->
- <defs>
- <path d="M 9.8125 72.90625
-L 19.671875 72.90625
-L 19.671875 43.015625
-L 55.515625 43.015625
-L 55.515625 72.90625
-L 65.375 72.90625
-L 65.375 0
-L 55.515625 0
-L 55.515625 34.71875
-L 19.671875 34.71875
-L 19.671875 0
-L 9.8125 0
-z
-" id="DejaVuSans-72"/>
- <path d="M 4.890625 31.390625
-L 31.203125 31.390625
-L 31.203125 23.390625
-L 4.890625 23.390625
-z
-" id="DejaVuSans-45"/>
- </defs>
- <g style="fill:#262626;" transform="translate(28.199687 109.657266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-83"/>
- <use x="63.476562" xlink:href="#DejaVuSans-72"/>
- <use x="138.671875" xlink:href="#DejaVuSans-65"/>
- <use x="207.048828" xlink:href="#DejaVuSans-45"/>
- <use x="243.132812" xlink:href="#DejaVuSans-49"/>
- </g>
- </g>
- </g>
- <g id="ytick_4">
- <g id="line2d_13">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="138.742125"/>
- </g>
- </g>
- <g id="text_14">
- <!-- BLAKE2s -->
- <g style="fill:#262626;" transform="translate(13.846406 142.921266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-66"/>
- <use x="68.603516" xlink:href="#DejaVuSans-76"/>
- <use x="124.347656" xlink:href="#DejaVuSans-65"/>
- <use x="192.755859" xlink:href="#DejaVuSans-75"/>
- <use x="258.332031" xlink:href="#DejaVuSans-69"/>
- <use x="321.515625" xlink:href="#DejaVuSans-50"/>
- <use x="385.138672" xlink:href="#DejaVuSans-115"/>
- </g>
- </g>
- </g>
- <g id="ytick_5">
- <g id="line2d_14">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="172.006125"/>
- </g>
- </g>
- <g id="text_15">
- <!-- MD5 -->
- <defs>
- <path d="M 19.671875 64.796875
-L 19.671875 8.109375
-L 31.59375 8.109375
-Q 46.6875 8.109375 53.6875 14.9375
-Q 60.6875 21.78125 60.6875 36.53125
-Q 60.6875 51.171875 53.6875 57.984375
-Q 46.6875 64.796875 31.59375 64.796875
-z
-M 9.8125 72.90625
-L 30.078125 72.90625
-Q 51.265625 72.90625 61.171875 64.09375
-Q 71.09375 55.28125 71.09375 36.53125
-Q 71.09375 17.671875 61.125 8.828125
-Q 51.171875 0 30.078125 0
-L 9.8125 0
-z
-" id="DejaVuSans-68"/>
- </defs>
- <g style="fill:#262626;" transform="translate(36.984219 176.185266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-77"/>
- <use x="86.279297" xlink:href="#DejaVuSans-68"/>
- <use x="163.28125" xlink:href="#DejaVuSans-53"/>
- </g>
- </g>
- </g>
- <g id="ytick_6">
- <g id="line2d_15">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="205.270125"/>
- </g>
- </g>
- <g id="text_16">
- <!-- SHA-512 -->
- <g style="fill:#262626;" transform="translate(14.202188 209.449266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-83"/>
- <use x="63.476562" xlink:href="#DejaVuSans-72"/>
- <use x="138.671875" xlink:href="#DejaVuSans-65"/>
- <use x="207.048828" xlink:href="#DejaVuSans-45"/>
- <use x="243.132812" xlink:href="#DejaVuSans-53"/>
- <use x="306.755859" xlink:href="#DejaVuSans-49"/>
- <use x="370.378906" xlink:href="#DejaVuSans-50"/>
- </g>
- </g>
- </g>
- <g id="ytick_7">
- <g id="line2d_16">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="238.534125"/>
- </g>
- </g>
- <g id="text_17">
- <!-- SHA-256 -->
- <g style="fill:#262626;" transform="translate(14.202188 242.713266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-83"/>
- <use x="63.476562" xlink:href="#DejaVuSans-72"/>
- <use x="138.671875" xlink:href="#DejaVuSans-65"/>
- <use x="207.048828" xlink:href="#DejaVuSans-45"/>
- <use x="243.132812" xlink:href="#DejaVuSans-50"/>
- <use x="306.755859" xlink:href="#DejaVuSans-53"/>
- <use x="370.378906" xlink:href="#DejaVuSans-54"/>
- </g>
- </g>
- </g>
- <g id="ytick_8">
- <g id="line2d_17">
- <g>
- <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="271.798125"/>
- </g>
- </g>
- <g id="text_18">
- <!-- SHA3-256 -->
- <g style="fill:#262626;" transform="translate(7.2 275.977266)scale(0.11 -0.11)">
- <use xlink:href="#DejaVuSans-83"/>
- <use x="63.476562" xlink:href="#DejaVuSans-72"/>
- <use x="138.671875" xlink:href="#DejaVuSans-65"/>
- <use x="207.080078" xlink:href="#DejaVuSans-51"/>
- <use x="270.703125" xlink:href="#DejaVuSans-45"/>
- <use x="306.787109" xlink:href="#DejaVuSans-50"/>
- <use x="370.410156" xlink:href="#DejaVuSans-53"/>
- <use x="434.033203" xlink:href="#DejaVuSans-54"/>
- </g>
- </g>
- </g>
- </g>
- <g id="patch_3">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 25.644525
-L 377.942146 25.644525
-L 377.942146 52.255725
-L 71.443906 52.255725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="patch_4">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 58.908525
-L 130.011586 58.908525
-L 130.011586 85.519725
-L 71.443906 85.519725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="patch_5">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 92.172525
-L 117.289186 92.172525
-L 117.289186 118.783725
-L 71.443906 118.783725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="patch_6">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 125.436525
-L 110.548546 125.436525
-L 110.548546 152.047725
-L 71.443906 152.047725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="patch_7">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 158.700525
-L 104.477506 158.700525
-L 104.477506 185.311725
-L 71.443906 185.311725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="patch_8">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 191.964525
-L 103.584706 191.964525
-L 103.584706 218.575725
-L 71.443906 218.575725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="patch_9">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 225.228525
-L 93.049666 225.228525
-L 93.049666 251.839725
-L 71.443906 251.839725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="patch_10">
- <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 258.492525
-L 89.032066 258.492525
-L 89.032066 285.103725
-L 71.443906 285.103725
-z
-" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/>
- </g>
- <g id="line2d_18">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="line2d_19">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="line2d_20">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="line2d_21">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="line2d_22">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="line2d_23">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="line2d_24">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="line2d_25">
- <path clip-path="url(#p6091bd3d0f)" d="M 0 0
-" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/>
- </g>
- <g id="patch_11">
- <path d="M 71.443906 288.430125
-L 71.443906 22.318125
-" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/>
- </g>
- <g id="patch_12">
- <path d="M 428.563906 288.430125
-L 428.563906 22.318125
-" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/>
- </g>
- <g id="patch_13">
- <path d="M 71.443906 288.430125
-L 428.563906 288.430125
-" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/>
- </g>
- <g id="patch_14">
- <path d="M 71.443906 22.318125
-L 428.563906 22.318125
-" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/>
- </g>
- <g id="text_19">
- <!-- 6866 -->
- <g style="fill:#262626;" transform="translate(382.406146 43.939725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-54"/>
- <use x="63.623047" xlink:href="#DejaVuSans-56"/>
- <use x="127.246094" xlink:href="#DejaVuSans-54"/>
- <use x="190.869141" xlink:href="#DejaVuSans-54"/>
- </g>
- </g>
- <g id="text_20">
- <!-- 1312 -->
- <g style="fill:#262626;" transform="translate(134.475586 77.203725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-49"/>
- <use x="63.623047" xlink:href="#DejaVuSans-51"/>
- <use x="127.246094" xlink:href="#DejaVuSans-49"/>
- <use x="190.869141" xlink:href="#DejaVuSans-50"/>
- </g>
- </g>
- <g id="text_21">
- <!-- 1027 -->
- <g style="fill:#262626;" transform="translate(121.753186 110.467725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-49"/>
- <use x="63.623047" xlink:href="#DejaVuSans-48"/>
- <use x="127.246094" xlink:href="#DejaVuSans-50"/>
- <use x="190.869141" xlink:href="#DejaVuSans-55"/>
- </g>
- </g>
- <g id="text_22">
- <!-- 876 -->
- <g style="fill:#262626;" transform="translate(115.012546 143.731725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-56"/>
- <use x="63.623047" xlink:href="#DejaVuSans-55"/>
- <use x="127.246094" xlink:href="#DejaVuSans-54"/>
- </g>
- </g>
- <g id="text_23">
- <!-- 740 -->
- <g style="fill:#262626;" transform="translate(108.941506 176.995725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-55"/>
- <use x="63.623047" xlink:href="#DejaVuSans-52"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- <g id="text_24">
- <!-- 720 -->
- <g style="fill:#262626;" transform="translate(108.048706 210.259725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-55"/>
- <use x="63.623047" xlink:href="#DejaVuSans-50"/>
- <use x="127.246094" xlink:href="#DejaVuSans-48"/>
- </g>
- </g>
- <g id="text_25">
- <!-- 484 -->
- <g style="fill:#262626;" transform="translate(97.513666 243.523725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-52"/>
- <use x="63.623047" xlink:href="#DejaVuSans-56"/>
- <use x="127.246094" xlink:href="#DejaVuSans-52"/>
- </g>
- </g>
- <g id="text_26">
- <!-- 394 -->
- <defs>
- <path d="M 10.984375 1.515625
-L 10.984375 10.5
-Q 14.703125 8.734375 18.5 7.8125
-Q 22.3125 6.890625 25.984375 6.890625
-Q 35.75 6.890625 40.890625 13.453125
-Q 46.046875 20.015625 46.78125 33.40625
-Q 43.953125 29.203125 39.59375 26.953125
-Q 35.25 24.703125 29.984375 24.703125
-Q 19.046875 24.703125 12.671875 31.3125
-Q 6.296875 37.9375 6.296875 49.421875
-Q 6.296875 60.640625 12.9375 67.421875
-Q 19.578125 74.21875 30.609375 74.21875
-Q 43.265625 74.21875 49.921875 64.515625
-Q 56.59375 54.828125 56.59375 36.375
-Q 56.59375 19.140625 48.40625 8.859375
-Q 40.234375 -1.421875 26.421875 -1.421875
-Q 22.703125 -1.421875 18.890625 -0.6875
-Q 15.09375 0.046875 10.984375 1.515625
-z
-M 30.609375 32.421875
-Q 37.25 32.421875 41.125 36.953125
-Q 45.015625 41.5 45.015625 49.421875
-Q 45.015625 57.28125 41.125 61.84375
-Q 37.25 66.40625 30.609375 66.40625
-Q 23.96875 66.40625 20.09375 61.84375
-Q 16.21875 57.28125 16.21875 49.421875
-Q 16.21875 41.5 20.09375 36.953125
-Q 23.96875 32.421875 30.609375 32.421875
-z
-" id="DejaVuSans-57"/>
- </defs>
- <g style="fill:#262626;" transform="translate(93.496066 276.787725)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-51"/>
- <use x="63.623047" xlink:href="#DejaVuSans-57"/>
- <use x="127.246094" xlink:href="#DejaVuSans-52"/>
- </g>
- </g>
- <g id="text_27">
- <!-- Performance on AWS c5.metal, 16 KiB input, 1 thread -->
- <defs>
- <path d="M 19.671875 64.796875
-L 19.671875 37.40625
-L 32.078125 37.40625
-Q 38.96875 37.40625 42.71875 40.96875
-Q 46.484375 44.53125 46.484375 51.125
-Q 46.484375 57.671875 42.71875 61.234375
-Q 38.96875 64.796875 32.078125 64.796875
-z
-M 9.8125 72.90625
-L 32.078125 72.90625
-Q 44.34375 72.90625 50.609375 67.359375
-Q 56.890625 61.8125 56.890625 51.125
-Q 56.890625 40.328125 50.609375 34.8125
-Q 44.34375 29.296875 32.078125 29.296875
-L 19.671875 29.296875
-L 19.671875 0
-L 9.8125 0
-z
-" id="DejaVuSans-80"/>
- <path d="M 41.109375 46.296875
-Q 39.59375 47.171875 37.8125 47.578125
-Q 36.03125 48 33.890625 48
-Q 26.265625 48 22.1875 43.046875
-Q 18.109375 38.09375 18.109375 28.8125
-L 18.109375 0
-L 9.078125 0
-L 9.078125 54.6875
-L 18.109375 54.6875
-L 18.109375 46.1875
-Q 20.953125 51.171875 25.484375 53.578125
-Q 30.03125 56 36.53125 56
-Q 37.453125 56 38.578125 55.875
-Q 39.703125 55.765625 41.0625 55.515625
-z
-" id="DejaVuSans-114"/>
- <path d="M 37.109375 75.984375
-L 37.109375 68.5
-L 28.515625 68.5
-Q 23.6875 68.5 21.796875 66.546875
-Q 19.921875 64.59375 19.921875 59.515625
-L 19.921875 54.6875
-L 34.71875 54.6875
-L 34.71875 47.703125
-L 19.921875 47.703125
-L 19.921875 0
-L 10.890625 0
-L 10.890625 47.703125
-L 2.296875 47.703125
-L 2.296875 54.6875
-L 10.890625 54.6875
-L 10.890625 58.5
-Q 10.890625 67.625 15.140625 71.796875
-Q 19.390625 75.984375 28.609375 75.984375
-z
-" id="DejaVuSans-102"/>
- <path d="M 30.609375 48.390625
-Q 23.390625 48.390625 19.1875 42.75
-Q 14.984375 37.109375 14.984375 27.296875
-Q 14.984375 17.484375 19.15625 11.84375
-Q 23.34375 6.203125 30.609375 6.203125
-Q 37.796875 6.203125 41.984375 11.859375
-Q 46.1875 17.53125 46.1875 27.296875
-Q 46.1875 37.015625 41.984375 42.703125
-Q 37.796875 48.390625 30.609375 48.390625
-z
-M 30.609375 56
-Q 42.328125 56 49.015625 48.375
-Q 55.71875 40.765625 55.71875 27.296875
-Q 55.71875 13.875 49.015625 6.21875
-Q 42.328125 -1.421875 30.609375 -1.421875
-Q 18.84375 -1.421875 12.171875 6.21875
-Q 5.515625 13.875 5.515625 27.296875
-Q 5.515625 40.765625 12.171875 48.375
-Q 18.84375 56 30.609375 56
-z
-" id="DejaVuSans-111"/>
- <path d="M 52 44.1875
-Q 55.375 50.25 60.0625 53.125
-Q 64.75 56 71.09375 56
-Q 79.640625 56 84.28125 50.015625
-Q 88.921875 44.046875 88.921875 33.015625
-L 88.921875 0
-L 79.890625 0
-L 79.890625 32.71875
-Q 79.890625 40.578125 77.09375 44.375
-Q 74.3125 48.1875 68.609375 48.1875
-Q 61.625 48.1875 57.5625 43.546875
-Q 53.515625 38.921875 53.515625 30.90625
-L 53.515625 0
-L 44.484375 0
-L 44.484375 32.71875
-Q 44.484375 40.625 41.703125 44.40625
-Q 38.921875 48.1875 33.109375 48.1875
-Q 26.21875 48.1875 22.15625 43.53125
-Q 18.109375 38.875 18.109375 30.90625
-L 18.109375 0
-L 9.078125 0
-L 9.078125 54.6875
-L 18.109375 54.6875
-L 18.109375 46.1875
-Q 21.1875 51.21875 25.484375 53.609375
-Q 29.78125 56 35.6875 56
-Q 41.65625 56 45.828125 52.96875
-Q 50 49.953125 52 44.1875
-z
-" id="DejaVuSans-109"/>
- <path d="M 34.28125 27.484375
-Q 23.390625 27.484375 19.1875 25
-Q 14.984375 22.515625 14.984375 16.5
-Q 14.984375 11.71875 18.140625 8.90625
-Q 21.296875 6.109375 26.703125 6.109375
-Q 34.1875 6.109375 38.703125 11.40625
-Q 43.21875 16.703125 43.21875 25.484375
-L 43.21875 27.484375
-z
-M 52.203125 31.203125
-L 52.203125 0
-L 43.21875 0
-L 43.21875 8.296875
-Q 40.140625 3.328125 35.546875 0.953125
-Q 30.953125 -1.421875 24.3125 -1.421875
-Q 15.921875 -1.421875 10.953125 3.296875
-Q 6 8.015625 6 15.921875
-Q 6 25.140625 12.171875 29.828125
-Q 18.359375 34.515625 30.609375 34.515625
-L 43.21875 34.515625
-L 43.21875 35.40625
-Q 43.21875 41.609375 39.140625 45
-Q 35.0625 48.390625 27.6875 48.390625
-Q 23 48.390625 18.546875 47.265625
-Q 14.109375 46.140625 10.015625 43.890625
-L 10.015625 52.203125
-Q 14.9375 54.109375 19.578125 55.046875
-Q 24.21875 56 28.609375 56
-Q 40.484375 56 46.34375 49.84375
-Q 52.203125 43.703125 52.203125 31.203125
-z
-" id="DejaVuSans-97"/>
- <path d="M 54.890625 33.015625
-L 54.890625 0
-L 45.90625 0
-L 45.90625 32.71875
-Q 45.90625 40.484375 42.875 44.328125
-Q 39.84375 48.1875 33.796875 48.1875
-Q 26.515625 48.1875 22.3125 43.546875
-Q 18.109375 38.921875 18.109375 30.90625
-L 18.109375 0
-L 9.078125 0
-L 9.078125 54.6875
-L 18.109375 54.6875
-L 18.109375 46.1875
-Q 21.34375 51.125 25.703125 53.5625
-Q 30.078125 56 35.796875 56
-Q 45.21875 56 50.046875 50.171875
-Q 54.890625 44.34375 54.890625 33.015625
-z
-" id="DejaVuSans-110"/>
- <path d="M 48.78125 52.59375
-L 48.78125 44.1875
-Q 44.96875 46.296875 41.140625 47.34375
-Q 37.3125 48.390625 33.40625 48.390625
-Q 24.65625 48.390625 19.8125 42.84375
-Q 14.984375 37.3125 14.984375 27.296875
-Q 14.984375 17.28125 19.8125 11.734375
-Q 24.65625 6.203125 33.40625 6.203125
-Q 37.3125 6.203125 41.140625 7.25
-Q 44.96875 8.296875 48.78125 10.40625
-L 48.78125 2.09375
-Q 45.015625 0.34375 40.984375 -0.53125
-Q 36.96875 -1.421875 32.421875 -1.421875
-Q 20.0625 -1.421875 12.78125 6.34375
-Q 5.515625 14.109375 5.515625 27.296875
-Q 5.515625 40.671875 12.859375 48.328125
-Q 20.21875 56 33.015625 56
-Q 37.15625 56 41.109375 55.140625
-Q 45.0625 54.296875 48.78125 52.59375
-z
-" id="DejaVuSans-99"/>
- <path d="M 3.328125 72.90625
-L 13.28125 72.90625
-L 28.609375 11.28125
-L 43.890625 72.90625
-L 54.984375 72.90625
-L 70.3125 11.28125
-L 85.59375 72.90625
-L 95.609375 72.90625
-L 77.296875 0
-L 64.890625 0
-L 49.515625 63.28125
-L 33.984375 0
-L 21.578125 0
-z
-" id="DejaVuSans-87"/>
- <path d="M 10.6875 12.40625
-L 21 12.40625
-L 21 0
-L 10.6875 0
-z
-" id="DejaVuSans-46"/>
- <path d="M 18.3125 70.21875
-L 18.3125 54.6875
-L 36.8125 54.6875
-L 36.8125 47.703125
-L 18.3125 47.703125
-L 18.3125 18.015625
-Q 18.3125 11.328125 20.140625 9.421875
-Q 21.96875 7.515625 27.59375 7.515625
-L 36.8125 7.515625
-L 36.8125 0
-L 27.59375 0
-Q 17.1875 0 13.234375 3.875
-Q 9.28125 7.765625 9.28125 18.015625
-L 9.28125 47.703125
-L 2.6875 47.703125
-L 2.6875 54.6875
-L 9.28125 54.6875
-L 9.28125 70.21875
-z
-" id="DejaVuSans-116"/>
- <path d="M 9.421875 75.984375
-L 18.40625 75.984375
-L 18.40625 0
-L 9.421875 0
-z
-" id="DejaVuSans-108"/>
- <path d="M 11.71875 12.40625
-L 22.015625 12.40625
-L 22.015625 4
-L 14.015625 -11.625
-L 7.71875 -11.625
-L 11.71875 4
-z
-" id="DejaVuSans-44"/>
- <path d="M 8.5 21.578125
-L 8.5 54.6875
-L 17.484375 54.6875
-L 17.484375 21.921875
-Q 17.484375 14.15625 20.5 10.265625
-Q 23.53125 6.390625 29.59375 6.390625
-Q 36.859375 6.390625 41.078125 11.03125
-Q 45.3125 15.671875 45.3125 23.6875
-L 45.3125 54.6875
-L 54.296875 54.6875
-L 54.296875 0
-L 45.3125 0
-L 45.3125 8.40625
-Q 42.046875 3.421875 37.71875 1
-Q 33.40625 -1.421875 27.6875 -1.421875
-Q 18.265625 -1.421875 13.375 4.4375
-Q 8.5 10.296875 8.5 21.578125
-z
-M 31.109375 56
-z
-" id="DejaVuSans-117"/>
- <path d="M 54.890625 33.015625
-L 54.890625 0
-L 45.90625 0
-L 45.90625 32.71875
-Q 45.90625 40.484375 42.875 44.328125
-Q 39.84375 48.1875 33.796875 48.1875
-Q 26.515625 48.1875 22.3125 43.546875
-Q 18.109375 38.921875 18.109375 30.90625
-L 18.109375 0
-L 9.078125 0
-L 9.078125 75.984375
-L 18.109375 75.984375
-L 18.109375 46.1875
-Q 21.34375 51.125 25.703125 53.5625
-Q 30.078125 56 35.796875 56
-Q 45.21875 56 50.046875 50.171875
-Q 54.890625 44.34375 54.890625 33.015625
-z
-" id="DejaVuSans-104"/>
- </defs>
- <g style="fill:#262626;" transform="translate(88.626406 16.318125)scale(0.12 -0.12)">
- <use xlink:href="#DejaVuSans-80"/>
- <use x="60.255859" xlink:href="#DejaVuSans-101"/>
- <use x="121.779297" xlink:href="#DejaVuSans-114"/>
- <use x="162.892578" xlink:href="#DejaVuSans-102"/>
- <use x="198.097656" xlink:href="#DejaVuSans-111"/>
- <use x="259.279297" xlink:href="#DejaVuSans-114"/>
- <use x="300.376953" xlink:href="#DejaVuSans-109"/>
- <use x="397.789062" xlink:href="#DejaVuSans-97"/>
- <use x="459.068359" xlink:href="#DejaVuSans-110"/>
- <use x="522.447266" xlink:href="#DejaVuSans-99"/>
- <use x="577.427734" xlink:href="#DejaVuSans-101"/>
- <use x="638.951172" xlink:href="#DejaVuSans-32"/>
- <use x="670.738281" xlink:href="#DejaVuSans-111"/>
- <use x="731.919922" xlink:href="#DejaVuSans-110"/>
- <use x="795.298828" xlink:href="#DejaVuSans-32"/>
- <use x="827.085938" xlink:href="#DejaVuSans-65"/>
- <use x="895.416016" xlink:href="#DejaVuSans-87"/>
- <use x="994.292969" xlink:href="#DejaVuSans-83"/>
- <use x="1057.769531" xlink:href="#DejaVuSans-32"/>
- <use x="1089.556641" xlink:href="#DejaVuSans-99"/>
- <use x="1144.537109" xlink:href="#DejaVuSans-53"/>
- <use x="1208.160156" xlink:href="#DejaVuSans-46"/>
- <use x="1239.947266" xlink:href="#DejaVuSans-109"/>
- <use x="1337.359375" xlink:href="#DejaVuSans-101"/>
- <use x="1398.882812" xlink:href="#DejaVuSans-116"/>
- <use x="1438.091797" xlink:href="#DejaVuSans-97"/>
- <use x="1499.371094" xlink:href="#DejaVuSans-108"/>
- <use x="1527.154297" xlink:href="#DejaVuSans-44"/>
- <use x="1558.941406" xlink:href="#DejaVuSans-32"/>
- <use x="1590.728516" xlink:href="#DejaVuSans-49"/>
- <use x="1654.351562" xlink:href="#DejaVuSans-54"/>
- <use x="1717.974609" xlink:href="#DejaVuSans-32"/>
- <use x="1749.761719" xlink:href="#DejaVuSans-75"/>
- <use x="1815.337891" xlink:href="#DejaVuSans-105"/>
- <use x="1843.121094" xlink:href="#DejaVuSans-66"/>
- <use x="1911.724609" xlink:href="#DejaVuSans-32"/>
- <use x="1943.511719" xlink:href="#DejaVuSans-105"/>
- <use x="1971.294922" xlink:href="#DejaVuSans-110"/>
- <use x="2034.673828" xlink:href="#DejaVuSans-112"/>
- <use x="2098.150391" xlink:href="#DejaVuSans-117"/>
- <use x="2161.529297" xlink:href="#DejaVuSans-116"/>
- <use x="2200.738281" xlink:href="#DejaVuSans-44"/>
- <use x="2232.525391" xlink:href="#DejaVuSans-32"/>
- <use x="2264.3125" xlink:href="#DejaVuSans-49"/>
- <use x="2327.935547" xlink:href="#DejaVuSans-32"/>
- <use x="2359.722656" xlink:href="#DejaVuSans-116"/>
- <use x="2398.931641" xlink:href="#DejaVuSans-104"/>
- <use x="2462.310547" xlink:href="#DejaVuSans-114"/>
- <use x="2503.392578" xlink:href="#DejaVuSans-101"/>
- <use x="2564.916016" xlink:href="#DejaVuSans-97"/>
- <use x="2626.195312" xlink:href="#DejaVuSans-100"/>
- </g>
- </g>
- </g>
- </g>
- <defs>
- <clipPath id="p6091bd3d0f">
- <rect height="266.112" width="357.12" x="71.443906" y="22.318125"/>
- </clipPath>
- </defs>
-</svg>
diff --git a/thirdparty/BLAKE3/reference_impl/Cargo.toml b/thirdparty/BLAKE3/reference_impl/Cargo.toml
deleted file mode 100644
index 8c81e5ad9..000000000
--- a/thirdparty/BLAKE3/reference_impl/Cargo.toml
+++ /dev/null
@@ -1,8 +0,0 @@
-[package]
-name = "reference_impl"
-version = "0.0.0"
-edition = "2018"
-
-[lib]
-name = "reference_impl"
-path = "reference_impl.rs"
diff --git a/thirdparty/BLAKE3/reference_impl/README.md b/thirdparty/BLAKE3/reference_impl/README.md
deleted file mode 100644
index 941fafd72..000000000
--- a/thirdparty/BLAKE3/reference_impl/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-This is the reference implementation of BLAKE3. It is used for testing and
-as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3
-spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf)
-discusses this implementation. You can render docs for this implementation
-by running `cargo doc --open` in this directory.
-
-This implementation is a single file
-([`reference_impl.rs`](reference_impl.rs)) with no dependencies. It is
-not optimized for performance.
diff --git a/thirdparty/BLAKE3/reference_impl/reference_impl.rs b/thirdparty/BLAKE3/reference_impl/reference_impl.rs
deleted file mode 100644
index 248834319..000000000
--- a/thirdparty/BLAKE3/reference_impl/reference_impl.rs
+++ /dev/null
@@ -1,383 +0,0 @@
-//! This is the reference implementation of BLAKE3. It is used for testing and
-//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3
-//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf)
-//! discusses this implementation. You can render docs for this implementation
-//! by running `cargo doc --open` in this directory.
-//!
-//! # Example
-//!
-//! ```
-//! let mut hasher = reference_impl::Hasher::new();
-//! hasher.update(b"abc");
-//! hasher.update(b"def");
-//! let mut hash = [0; 32];
-//! hasher.finalize(&mut hash);
-//! let mut extended_hash = [0; 500];
-//! hasher.finalize(&mut extended_hash);
-//! assert_eq!(hash, extended_hash[..32]);
-//! ```
-
-use core::cmp::min;
-use core::convert::TryInto;
-
-const OUT_LEN: usize = 32;
-const KEY_LEN: usize = 32;
-const BLOCK_LEN: usize = 64;
-const CHUNK_LEN: usize = 1024;
-
-const CHUNK_START: u32 = 1 << 0;
-const CHUNK_END: u32 = 1 << 1;
-const PARENT: u32 = 1 << 2;
-const ROOT: u32 = 1 << 3;
-const KEYED_HASH: u32 = 1 << 4;
-const DERIVE_KEY_CONTEXT: u32 = 1 << 5;
-const DERIVE_KEY_MATERIAL: u32 = 1 << 6;
-
-const IV: [u32; 8] = [
- 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
-];
-
-const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
-
-// The mixing function, G, which mixes either a column or a diagonal.
-fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) {
- state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx);
- state[d] = (state[d] ^ state[a]).rotate_right(16);
- state[c] = state[c].wrapping_add(state[d]);
- state[b] = (state[b] ^ state[c]).rotate_right(12);
- state[a] = state[a].wrapping_add(state[b]).wrapping_add(my);
- state[d] = (state[d] ^ state[a]).rotate_right(8);
- state[c] = state[c].wrapping_add(state[d]);
- state[b] = (state[b] ^ state[c]).rotate_right(7);
-}
-
-fn round(state: &mut [u32; 16], m: &[u32; 16]) {
- // Mix the columns.
- g(state, 0, 4, 8, 12, m[0], m[1]);
- g(state, 1, 5, 9, 13, m[2], m[3]);
- g(state, 2, 6, 10, 14, m[4], m[5]);
- g(state, 3, 7, 11, 15, m[6], m[7]);
- // Mix the diagonals.
- g(state, 0, 5, 10, 15, m[8], m[9]);
- g(state, 1, 6, 11, 12, m[10], m[11]);
- g(state, 2, 7, 8, 13, m[12], m[13]);
- g(state, 3, 4, 9, 14, m[14], m[15]);
-}
-
-fn permute(m: &mut [u32; 16]) {
- let mut permuted = [0; 16];
- for i in 0..16 {
- permuted[i] = m[MSG_PERMUTATION[i]];
- }
- *m = permuted;
-}
-
-fn compress(
- chaining_value: &[u32; 8],
- block_words: &[u32; 16],
- counter: u64,
- block_len: u32,
- flags: u32,
-) -> [u32; 16] {
- let mut state = [
- chaining_value[0],
- chaining_value[1],
- chaining_value[2],
- chaining_value[3],
- chaining_value[4],
- chaining_value[5],
- chaining_value[6],
- chaining_value[7],
- IV[0],
- IV[1],
- IV[2],
- IV[3],
- counter as u32,
- (counter >> 32) as u32,
- block_len,
- flags,
- ];
- let mut block = *block_words;
-
- round(&mut state, &block); // round 1
- permute(&mut block);
- round(&mut state, &block); // round 2
- permute(&mut block);
- round(&mut state, &block); // round 3
- permute(&mut block);
- round(&mut state, &block); // round 4
- permute(&mut block);
- round(&mut state, &block); // round 5
- permute(&mut block);
- round(&mut state, &block); // round 6
- permute(&mut block);
- round(&mut state, &block); // round 7
-
- for i in 0..8 {
- state[i] ^= state[i + 8];
- state[i + 8] ^= chaining_value[i];
- }
- state
-}
-
-fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] {
- compression_output[0..8].try_into().unwrap()
-}
-
-fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) {
- for (bytes_block, word) in bytes.chunks_exact(4).zip(words.iter_mut()) {
- *word = u32::from_le_bytes(bytes_block.try_into().unwrap());
- }
-}
-
-// Each chunk or parent node can produce either an 8-word chaining value or, by
-// setting the ROOT flag, any number of final output bytes. The Output struct
-// captures the state just prior to choosing between those two possibilities.
-struct Output {
- input_chaining_value: [u32; 8],
- block_words: [u32; 16],
- counter: u64,
- block_len: u32,
- flags: u32,
-}
-
-impl Output {
- fn chaining_value(&self) -> [u32; 8] {
- first_8_words(compress(
- &self.input_chaining_value,
- &self.block_words,
- self.counter,
- self.block_len,
- self.flags,
- ))
- }
-
- fn root_output_bytes(&self, out_slice: &mut [u8]) {
- let mut output_block_counter = 0;
- for out_block in out_slice.chunks_mut(2 * OUT_LEN) {
- let words = compress(
- &self.input_chaining_value,
- &self.block_words,
- output_block_counter,
- self.block_len,
- self.flags | ROOT,
- );
- // The output length might not be a multiple of 4.
- for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) {
- out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]);
- }
- output_block_counter += 1;
- }
- }
-}
-
-struct ChunkState {
- chaining_value: [u32; 8],
- chunk_counter: u64,
- block: [u8; BLOCK_LEN],
- block_len: u8,
- blocks_compressed: u8,
- flags: u32,
-}
-
-impl ChunkState {
- fn new(key: [u32; 8], chunk_counter: u64, flags: u32) -> Self {
- Self {
- chaining_value: key,
- chunk_counter,
- block: [0; BLOCK_LEN],
- block_len: 0,
- blocks_compressed: 0,
- flags,
- }
- }
-
- fn len(&self) -> usize {
- BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize
- }
-
- fn start_flag(&self) -> u32 {
- if self.blocks_compressed == 0 {
- CHUNK_START
- } else {
- 0
- }
- }
-
- fn update(&mut self, mut input: &[u8]) {
- while !input.is_empty() {
- // If the block buffer is full, compress it and clear it. More
- // input is coming, so this compression is not CHUNK_END.
- if self.block_len as usize == BLOCK_LEN {
- let mut block_words = [0; 16];
- words_from_little_endian_bytes(&self.block, &mut block_words);
- self.chaining_value = first_8_words(compress(
- &self.chaining_value,
- &block_words,
- self.chunk_counter,
- BLOCK_LEN as u32,
- self.flags | self.start_flag(),
- ));
- self.blocks_compressed += 1;
- self.block = [0; BLOCK_LEN];
- self.block_len = 0;
- }
-
- // Copy input bytes into the block buffer.
- let want = BLOCK_LEN - self.block_len as usize;
- let take = min(want, input.len());
- self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]);
- self.block_len += take as u8;
- input = &input[take..];
- }
- }
-
- fn output(&self) -> Output {
- let mut block_words = [0; 16];
- words_from_little_endian_bytes(&self.block, &mut block_words);
- Output {
- input_chaining_value: self.chaining_value,
- block_words,
- block_len: self.block_len as u32,
- counter: self.chunk_counter,
- flags: self.flags | self.start_flag() | CHUNK_END,
- }
- }
-}
-
-fn parent_output(
- left_child_cv: [u32; 8],
- right_child_cv: [u32; 8],
- key: [u32; 8],
- flags: u32,
-) -> Output {
- let mut block_words = [0; 16];
- block_words[..8].copy_from_slice(&left_child_cv);
- block_words[8..].copy_from_slice(&right_child_cv);
- Output {
- input_chaining_value: key,
- block_words,
- counter: 0, // Always 0 for parent nodes.
- block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes.
- flags: PARENT | flags,
- }
-}
-
-fn parent_cv(
- left_child_cv: [u32; 8],
- right_child_cv: [u32; 8],
- key: [u32; 8],
- flags: u32,
-) -> [u32; 8] {
- parent_output(left_child_cv, right_child_cv, key, flags).chaining_value()
-}
-
-/// An incremental hasher that can accept any number of writes.
-pub struct Hasher {
- chunk_state: ChunkState,
- key: [u32; 8],
- cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values:
- cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64
- flags: u32,
-}
-
-impl Hasher {
- fn new_internal(key: [u32; 8], flags: u32) -> Self {
- Self {
- chunk_state: ChunkState::new(key, 0, flags),
- key,
- cv_stack: [[0; 8]; 54],
- cv_stack_len: 0,
- flags,
- }
- }
-
- /// Construct a new `Hasher` for the regular hash function.
- pub fn new() -> Self {
- Self::new_internal(IV, 0)
- }
-
- /// Construct a new `Hasher` for the keyed hash function.
- pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
- let mut key_words = [0; 8];
- words_from_little_endian_bytes(key, &mut key_words);
- Self::new_internal(key_words, KEYED_HASH)
- }
-
- /// Construct a new `Hasher` for the key derivation function. The context
- /// string should be hardcoded, globally unique, and application-specific.
- pub fn new_derive_key(context: &str) -> Self {
- let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT);
- context_hasher.update(context.as_bytes());
- let mut context_key = [0; KEY_LEN];
- context_hasher.finalize(&mut context_key);
- let mut context_key_words = [0; 8];
- words_from_little_endian_bytes(&context_key, &mut context_key_words);
- Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL)
- }
-
- fn push_stack(&mut self, cv: [u32; 8]) {
- self.cv_stack[self.cv_stack_len as usize] = cv;
- self.cv_stack_len += 1;
- }
-
- fn pop_stack(&mut self) -> [u32; 8] {
- self.cv_stack_len -= 1;
- self.cv_stack[self.cv_stack_len as usize]
- }
-
- // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail.
- fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) {
- // This chunk might complete some subtrees. For each completed subtree,
- // its left child will be the current top entry in the CV stack, and
- // its right child will be the current value of `new_cv`. Pop each left
- // child off the stack, merge it with `new_cv`, and overwrite `new_cv`
- // with the result. After all these merges, push the final value of
- // `new_cv` onto the stack. The number of completed subtrees is given
- // by the number of trailing 0-bits in the new total number of chunks.
- while total_chunks & 1 == 0 {
- new_cv = parent_cv(self.pop_stack(), new_cv, self.key, self.flags);
- total_chunks >>= 1;
- }
- self.push_stack(new_cv);
- }
-
- /// Add input to the hash state. This can be called any number of times.
- pub fn update(&mut self, mut input: &[u8]) {
- while !input.is_empty() {
- // If the current chunk is complete, finalize it and reset the
- // chunk state. More input is coming, so this chunk is not ROOT.
- if self.chunk_state.len() == CHUNK_LEN {
- let chunk_cv = self.chunk_state.output().chaining_value();
- let total_chunks = self.chunk_state.chunk_counter + 1;
- self.add_chunk_chaining_value(chunk_cv, total_chunks);
- self.chunk_state = ChunkState::new(self.key, total_chunks, self.flags);
- }
-
- // Compress input bytes into the current chunk state.
- let want = CHUNK_LEN - self.chunk_state.len();
- let take = min(want, input.len());
- self.chunk_state.update(&input[..take]);
- input = &input[take..];
- }
- }
-
- /// Finalize the hash and write any number of output bytes.
- pub fn finalize(&self, out_slice: &mut [u8]) {
- // Starting with the Output from the current chunk, compute all the
- // parent chaining values along the right edge of the tree, until we
- // have the root Output.
- let mut output = self.chunk_state.output();
- let mut parent_nodes_remaining = self.cv_stack_len as usize;
- while parent_nodes_remaining > 0 {
- parent_nodes_remaining -= 1;
- output = parent_output(
- self.cv_stack[parent_nodes_remaining],
- output.chaining_value(),
- self.key,
- self.flags,
- );
- }
- output.root_output_bytes(out_slice);
- }
-}
diff --git a/thirdparty/BLAKE3/src/ffi_avx2.rs b/thirdparty/BLAKE3/src/ffi_avx2.rs
deleted file mode 100644
index d805e868e..000000000
--- a/thirdparty/BLAKE3/src/ffi_avx2.rs
+++ /dev/null
@@ -1,63 +0,0 @@
-use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
-
-// Note that there is no AVX2 implementation of compress_in_place or
-// compress_xof.
-
-// Unsafe because this may only be called on platforms supporting AVX2.
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-) {
- // The Rust hash_many implementations do bounds checking on the `out`
- // array, but the C implementations don't. Even though this is an unsafe
- // function, assert the bounds here.
- assert!(out.len() >= inputs.len() * OUT_LEN);
- ffi::blake3_hash_many_avx2(
- inputs.as_ptr() as *const *const u8,
- inputs.len(),
- A::CAPACITY / BLOCK_LEN,
- key.as_ptr(),
- counter,
- increment_counter.yes(),
- flags,
- flags_start,
- flags_end,
- out.as_mut_ptr(),
- )
-}
-
-pub mod ffi {
- extern "C" {
- pub fn blake3_hash_many_avx2(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_hash_many() {
- if !crate::platform::avx2_detected() {
- return;
- }
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/ffi_avx512.rs b/thirdparty/BLAKE3/src/ffi_avx512.rs
deleted file mode 100644
index c1b9f649b..000000000
--- a/thirdparty/BLAKE3/src/ffi_avx512.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
-
-// Unsafe because this may only be called on platforms supporting AVX-512.
-pub unsafe fn compress_in_place(
- cv: &mut CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) {
- ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
-}
-
-// Unsafe because this may only be called on platforms supporting AVX-512.
-pub unsafe fn compress_xof(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64] {
- let mut out = [0u8; 64];
- ffi::blake3_compress_xof_avx512(
- cv.as_ptr(),
- block.as_ptr(),
- block_len,
- counter,
- flags,
- out.as_mut_ptr(),
- );
- out
-}
-
-// Unsafe because this may only be called on platforms supporting AVX-512.
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-) {
- // The Rust hash_many implementations do bounds checking on the `out`
- // array, but the C implementations don't. Even though this is an unsafe
- // function, assert the bounds here.
- assert!(out.len() >= inputs.len() * OUT_LEN);
- ffi::blake3_hash_many_avx512(
- inputs.as_ptr() as *const *const u8,
- inputs.len(),
- A::CAPACITY / BLOCK_LEN,
- key.as_ptr(),
- counter,
- increment_counter.yes(),
- flags,
- flags_start,
- flags_end,
- out.as_mut_ptr(),
- )
-}
-
-pub mod ffi {
- extern "C" {
- pub fn blake3_compress_in_place_avx512(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- );
- pub fn blake3_compress_xof_avx512(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
- );
- pub fn blake3_hash_many_avx512(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_compress() {
- if !crate::platform::avx512_detected() {
- return;
- }
- crate::test::test_compress_fn(compress_in_place, compress_xof);
- }
-
- #[test]
- fn test_hash_many() {
- if !crate::platform::avx512_detected() {
- return;
- }
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/ffi_neon.rs b/thirdparty/BLAKE3/src/ffi_neon.rs
deleted file mode 100644
index 889974277..000000000
--- a/thirdparty/BLAKE3/src/ffi_neon.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
-
-// Unsafe because this may only be called on platforms supporting NEON.
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-) {
- // The Rust hash_many implementations do bounds checking on the `out`
- // array, but the C implementations don't. Even though this is an unsafe
- // function, assert the bounds here.
- assert!(out.len() >= inputs.len() * OUT_LEN);
- ffi::blake3_hash_many_neon(
- inputs.as_ptr() as *const *const u8,
- inputs.len(),
- A::CAPACITY / BLOCK_LEN,
- key.as_ptr(),
- counter,
- increment_counter.yes(),
- flags,
- flags_start,
- flags_end,
- out.as_mut_ptr(),
- )
-}
-
-// blake3_neon.c normally depends on blake3_portable.c, because the NEON
-// implementation only provides 4x compression, and it relies on the portable
-// implementation for 1x compression. However, we expose the portable Rust
-// implementation here instead, to avoid linking in unnecessary code.
-#[no_mangle]
-pub extern "C" fn blake3_compress_in_place_portable(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
-) {
- unsafe {
- crate::portable::compress_in_place(
- &mut *(cv as *mut [u32; 8]),
- &*(block as *const [u8; 64]),
- block_len,
- counter,
- flags,
- )
- }
-}
-
-pub mod ffi {
- extern "C" {
- pub fn blake3_hash_many_neon(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_hash_many() {
- // This entire file is gated on feature="neon", so NEON support is
- // assumed here.
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/ffi_sse2.rs b/thirdparty/BLAKE3/src/ffi_sse2.rs
deleted file mode 100644
index c49a229ad..000000000
--- a/thirdparty/BLAKE3/src/ffi_sse2.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
-
-// Unsafe because this may only be called on platforms supporting SSE2.
-pub unsafe fn compress_in_place(
- cv: &mut CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) {
- ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
-}
-
-// Unsafe because this may only be called on platforms supporting SSE2.
-pub unsafe fn compress_xof(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64] {
- let mut out = [0u8; 64];
- ffi::blake3_compress_xof_sse2(
- cv.as_ptr(),
- block.as_ptr(),
- block_len,
- counter,
- flags,
- out.as_mut_ptr(),
- );
- out
-}
-
-// Unsafe because this may only be called on platforms supporting SSE2.
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-) {
- // The Rust hash_many implementations do bounds checking on the `out`
- // array, but the C implementations don't. Even though this is an unsafe
- // function, assert the bounds here.
- assert!(out.len() >= inputs.len() * OUT_LEN);
- ffi::blake3_hash_many_sse2(
- inputs.as_ptr() as *const *const u8,
- inputs.len(),
- A::CAPACITY / BLOCK_LEN,
- key.as_ptr(),
- counter,
- increment_counter.yes(),
- flags,
- flags_start,
- flags_end,
- out.as_mut_ptr(),
- )
-}
-
-pub mod ffi {
- extern "C" {
- pub fn blake3_compress_in_place_sse2(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- );
- pub fn blake3_compress_xof_sse2(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
- );
- pub fn blake3_hash_many_sse2(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_compress() {
- if !crate::platform::sse2_detected() {
- return;
- }
- crate::test::test_compress_fn(compress_in_place, compress_xof);
- }
-
- #[test]
- fn test_hash_many() {
- if !crate::platform::sse2_detected() {
- return;
- }
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/ffi_sse41.rs b/thirdparty/BLAKE3/src/ffi_sse41.rs
deleted file mode 100644
index 0b64c90a0..000000000
--- a/thirdparty/BLAKE3/src/ffi_sse41.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
-
-// Unsafe because this may only be called on platforms supporting SSE4.1.
-pub unsafe fn compress_in_place(
- cv: &mut CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) {
- ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
-}
-
-// Unsafe because this may only be called on platforms supporting SSE4.1.
-pub unsafe fn compress_xof(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64] {
- let mut out = [0u8; 64];
- ffi::blake3_compress_xof_sse41(
- cv.as_ptr(),
- block.as_ptr(),
- block_len,
- counter,
- flags,
- out.as_mut_ptr(),
- );
- out
-}
-
-// Unsafe because this may only be called on platforms supporting SSE4.1.
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-) {
- // The Rust hash_many implementations do bounds checking on the `out`
- // array, but the C implementations don't. Even though this is an unsafe
- // function, assert the bounds here.
- assert!(out.len() >= inputs.len() * OUT_LEN);
- ffi::blake3_hash_many_sse41(
- inputs.as_ptr() as *const *const u8,
- inputs.len(),
- A::CAPACITY / BLOCK_LEN,
- key.as_ptr(),
- counter,
- increment_counter.yes(),
- flags,
- flags_start,
- flags_end,
- out.as_mut_ptr(),
- )
-}
-
-pub mod ffi {
- extern "C" {
- pub fn blake3_compress_in_place_sse41(
- cv: *mut u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- );
- pub fn blake3_compress_xof_sse41(
- cv: *const u32,
- block: *const u8,
- block_len: u8,
- counter: u64,
- flags: u8,
- out: *mut u8,
- );
- pub fn blake3_hash_many_sse41(
- inputs: *const *const u8,
- num_inputs: usize,
- blocks: usize,
- key: *const u32,
- counter: u64,
- increment_counter: bool,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_compress() {
- if !crate::platform::sse41_detected() {
- return;
- }
- crate::test::test_compress_fn(compress_in_place, compress_xof);
- }
-
- #[test]
- fn test_hash_many() {
- if !crate::platform::sse41_detected() {
- return;
- }
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/guts.rs b/thirdparty/BLAKE3/src/guts.rs
deleted file mode 100644
index 88dcc86cd..000000000
--- a/thirdparty/BLAKE3/src/guts.rs
+++ /dev/null
@@ -1,95 +0,0 @@
-// This module is for incremental use cases like the `bao` crate, which need to
-// get their hands on internal chunk and parent chaining values. The vast
-// majority of users should ignore this and use the publicly documented
-// interface instead.
-
-#[derive(Clone, Debug)]
-pub struct ChunkState(crate::ChunkState);
-
-impl ChunkState {
- // Currently this type only supports the regular hash mode. If an
- // incremental user needs keyed_hash or derive_key, we can add that.
- pub fn new(chunk_counter: u64) -> Self {
- Self(crate::ChunkState::new(
- crate::IV,
- chunk_counter,
- 0,
- crate::platform::Platform::detect(),
- ))
- }
-
- #[inline]
- pub fn len(&self) -> usize {
- self.0.len()
- }
-
- #[inline]
- pub fn update(&mut self, input: &[u8]) -> &mut Self {
- self.0.update(input);
- self
- }
-
- pub fn finalize(&self, is_root: bool) -> crate::Hash {
- let output = self.0.output();
- if is_root {
- output.root_hash()
- } else {
- output.chaining_value().into()
- }
- }
-}
-
-// As above, this currently assumes the regular hash mode. If an incremental
-// user needs keyed_hash or derive_key, we can add that.
-pub fn parent_cv(
- left_child: &crate::Hash,
- right_child: &crate::Hash,
- is_root: bool,
-) -> crate::Hash {
- let output = crate::parent_node_output(
- left_child.as_bytes(),
- right_child.as_bytes(),
- crate::IV,
- 0,
- crate::platform::Platform::detect(),
- );
- if is_root {
- output.root_hash()
- } else {
- output.chaining_value().into()
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_chunk() {
- assert_eq!(
- crate::hash(b"foo"),
- ChunkState::new(0).update(b"foo").finalize(true)
- );
- }
-
- #[test]
- fn test_parents() {
- let mut hasher = crate::Hasher::new();
- let mut buf = [0; crate::CHUNK_LEN];
-
- buf[0] = 'a' as u8;
- hasher.update(&buf);
- let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false);
-
- buf[0] = 'b' as u8;
- hasher.update(&buf);
- let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false);
-
- hasher.update(b"c");
- let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false);
-
- let parent = parent_cv(&chunk0_cv, &chunk1_cv, false);
- let root = parent_cv(&parent, &chunk2_cv, true);
- assert_eq!(hasher.finalize(), root);
- }
-}
diff --git a/thirdparty/BLAKE3/src/join.rs b/thirdparty/BLAKE3/src/join.rs
deleted file mode 100644
index 60932db1c..000000000
--- a/thirdparty/BLAKE3/src/join.rs
+++ /dev/null
@@ -1,120 +0,0 @@
-//! The multi-threading abstractions used by [`Hasher::update_with_join`].
-//!
-//! Different implementations of the `Join` trait determine whether
-//! [`Hasher::update_with_join`] performs multi-threading on sufficiently large
-//! inputs. The `SerialJoin` implementation is single-threaded, and the
-//! `RayonJoin` implementation (gated by the `rayon` feature) is
-//! multi-threaded. Interfaces other than [`Hasher::update_with_join`], like
-//! [`hash`] and [`Hasher::update`], always use `SerialJoin` internally.
-//!
-//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and
-//! `RayonJoin` is the only non-trivial implementation provided. The only
-//! difference between the function signature in the `Join` trait and the
-//! underlying one in Rayon, is that the trait method includes two length
-//! parameters. This gives an implementation the option of e.g. setting a
-//! subtree size threshold below which it keeps splits on the same thread.
-//! However, neither of the two provided implementations currently makes use of
-//! those parameters. Note that in Rayon, the very first `join` call is more
-//! expensive than subsequent calls, because it moves work from the calling
-//! thread into the thread pool. That makes a coarse-grained input length
-//! threshold in the caller more effective than a fine-grained subtree size
-//! threshold after the implementation has already started recursing.
-//!
-//! # Example
-//!
-//! ```
-//! // Hash a large input using multi-threading. Note that multi-threading
-//! // comes with some overhead, and it can actually hurt performance for small
-//! // inputs. The meaning of "small" varies, however, depending on the
-//! // platform and the number of threads. (On x86_64, the cutoff tends to be
-//! // around 128 KiB.) You should benchmark your own use case to see whether
-//! // multi-threading helps.
-//! # #[cfg(feature = "rayon")]
-//! # {
-//! # fn some_large_input() -> &'static [u8] { b"foo" }
-//! let input: &[u8] = some_large_input();
-//! let mut hasher = blake3::Hasher::new();
-//! hasher.update_with_join::<blake3::join::RayonJoin>(input);
-//! let hash = hasher.finalize();
-//! # }
-//! ```
-//!
-//! [`Hasher::update_with_join`]: ../struct.Hasher.html#method.update_with_join
-//! [`Hasher::update`]: ../struct.Hasher.html#method.update
-//! [`hash`]: ../fn.hash.html
-//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html
-
-/// The trait that abstracts over single-threaded and multi-threaded recursion.
-///
-/// See the [`join` module docs](index.html) for more details.
-pub trait Join {
- fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB)
- where
- A: FnOnce() -> RA + Send,
- B: FnOnce() -> RB + Send,
- RA: Send,
- RB: Send;
-}
-
-/// The trivial, serial implementation of `Join`. The left and right sides are
-/// executed one after the other, on the calling thread. The standalone hashing
-/// functions and the `Hasher::update` method use this implementation
-/// internally.
-///
-/// See the [`join` module docs](index.html) for more details.
-pub enum SerialJoin {}
-
-impl Join for SerialJoin {
- #[inline]
- fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB)
- where
- A: FnOnce() -> RA + Send,
- B: FnOnce() -> RB + Send,
- RA: Send,
- RB: Send,
- {
- (oper_a(), oper_b())
- }
-}
-
-/// The Rayon-based implementation of `Join`. The left and right sides are
-/// executed on the Rayon thread pool, potentially in parallel. This
-/// implementation is gated by the `rayon` feature, which is off by default.
-///
-/// See the [`join` module docs](index.html) for more details.
-#[cfg(feature = "rayon")]
-pub enum RayonJoin {}
-
-#[cfg(feature = "rayon")]
-impl Join for RayonJoin {
- #[inline]
- fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB)
- where
- A: FnOnce() -> RA + Send,
- B: FnOnce() -> RB + Send,
- RA: Send,
- RB: Send,
- {
- rayon::join(oper_a, oper_b)
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_serial_join() {
- let oper_a = || 1 + 1;
- let oper_b = || 2 + 2;
- assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b, 3, 4));
- }
-
- #[test]
- #[cfg(feature = "rayon")]
- fn test_rayon_join() {
- let oper_a = || 1 + 1;
- let oper_b = || 2 + 2;
- assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b, 3, 4));
- }
-}
diff --git a/thirdparty/BLAKE3/src/lib.rs b/thirdparty/BLAKE3/src/lib.rs
deleted file mode 100644
index bf66b6dae..000000000
--- a/thirdparty/BLAKE3/src/lib.rs
+++ /dev/null
@@ -1,1359 +0,0 @@
-//! The official Rust implementation of the [BLAKE3] cryptographic hash
-//! function.
-//!
-//! # Examples
-//!
-//! ```
-//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
-//! // Hash an input all at once.
-//! let hash1 = blake3::hash(b"foobarbaz");
-//!
-//! // Hash an input incrementally.
-//! let mut hasher = blake3::Hasher::new();
-//! hasher.update(b"foo");
-//! hasher.update(b"bar");
-//! hasher.update(b"baz");
-//! let hash2 = hasher.finalize();
-//! assert_eq!(hash1, hash2);
-//!
-//! // Extended output. OutputReader also implements Read and Seek.
-//! # #[cfg(feature = "std")] {
-//! let mut output = [0; 1000];
-//! let mut output_reader = hasher.finalize_xof();
-//! output_reader.fill(&mut output);
-//! assert_eq!(&output[..32], hash1.as_bytes());
-//! # }
-//!
-//! // Print a hash as hex.
-//! println!("{}", hash1.to_hex());
-//! # Ok(())
-//! # }
-//! ```
-//!
-//! # Cargo Features
-//!
-//! The `rayon` feature provides [Rayon]-based multi-threading, in particular
-//! the [`join::RayonJoin`] type for use with [`Hasher::update_with_join`]. It
-//! is disabled by default, but enabled for [docs.rs].
-//!
-//! The `neon` feature enables ARM NEON support. Currently there is no runtime
-//! CPU feature detection for NEON, so you must only enable this feature for
-//! targets that are known to have NEON support. In particular, some ARMv7
-//! targets support NEON, and some don't.
-//!
-//! The `std` feature (enabled by default) is required for implementations of
-//! the [`Write`] and [`Seek`] traits, and also for runtime CPU feature
-//! detection. If this feature is disabled, the only way to use the SIMD
-//! implementations in this crate is to enable the corresponding instruction
-//! sets statically for the entire build, with e.g. `RUSTFLAGS="-C
-//! target-cpu=native"`. The resulting binary will not be portable to other
-//! machines.
-//!
-//! [BLAKE3]: https://blake3.io
-//! [Rayon]: https://github.com/rayon-rs/rayon
-//! [`join::RayonJoin`]: join/enum.RayonJoin.html
-//! [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join
-//! [docs.rs]: https://docs.rs/
-//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html
-//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html
-
-#![cfg_attr(not(feature = "std"), no_std)]
-
-#[cfg(test)]
-mod test;
-
-// The guts module is for incremental use cases like the `bao` crate that need
-// to explicitly compute chunk and parent chaining values. It is semi-stable
-// and likely to keep working, but largely undocumented and not intended for
-// widespread use.
-#[doc(hidden)]
-pub mod guts;
-
-// The platform module is pub for benchmarks only. It is not stable.
-#[doc(hidden)]
-pub mod platform;
-
-// Platform-specific implementations of the compression function. These
-// BLAKE3-specific cfg flags are set in build.rs.
-#[cfg(blake3_avx2_rust)]
-#[path = "rust_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx2_ffi)]
-#[path = "ffi_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx512_ffi)]
-#[path = "ffi_avx512.rs"]
-mod avx512;
-#[cfg(feature = "neon")]
-#[path = "ffi_neon.rs"]
-mod neon;
-mod portable;
-#[cfg(blake3_sse2_rust)]
-#[path = "rust_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse2_ffi)]
-#[path = "ffi_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse41_rust)]
-#[path = "rust_sse41.rs"]
-mod sse41;
-#[cfg(blake3_sse41_ffi)]
-#[path = "ffi_sse41.rs"]
-mod sse41;
-
-pub mod traits;
-
-pub mod join;
-
-use arrayref::{array_mut_ref, array_ref};
-use arrayvec::{ArrayString, ArrayVec};
-use core::cmp;
-use core::fmt;
-use join::{Join, SerialJoin};
-use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2};
-
-/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
-pub const OUT_LEN: usize = 32;
-
-/// The number of bytes in a key, 32.
-pub const KEY_LEN: usize = 32;
-
-// These constants are pub for incremental use cases like `bao`, as well as
-// tests and benchmarks. Most callers should not need them.
-#[doc(hidden)]
-pub const BLOCK_LEN: usize = 64;
-#[doc(hidden)]
-pub const CHUNK_LEN: usize = 1024;
-#[doc(hidden)]
-pub const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64
-
-// While iterating the compression function within a chunk, the CV is
-// represented as words, to avoid doing two extra endianness conversions for
-// each compression in the portable implementation. But the hash_many interface
-// needs to hash both input bytes and parent nodes, so its better for its
-// output CVs to be represented as bytes.
-type CVWords = [u32; 8];
-type CVBytes = [u8; 32]; // little-endian
-
-const IV: &CVWords = &[
- 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
-];
-
-const MSG_SCHEDULE: [[usize; 16]; 7] = [
- [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
- [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
- [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
- [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
- [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
- [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
- [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
-];
-
-// These are the internal flags that we use to domain separate root/non-root,
-// chunk/parent, and chunk beginning/middle/end. These get set at the high end
-// of the block flags word in the compression function, so their values start
-// high and go down.
-const CHUNK_START: u8 = 1 << 0;
-const CHUNK_END: u8 = 1 << 1;
-const PARENT: u8 = 1 << 2;
-const ROOT: u8 = 1 << 3;
-const KEYED_HASH: u8 = 1 << 4;
-const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
-const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
-
-#[inline]
-fn counter_low(counter: u64) -> u32 {
- counter as u32
-}
-
-#[inline]
-fn counter_high(counter: u64) -> u32 {
- (counter >> 32) as u32
-}
-
-/// An output of the default size, 32 bytes, which provides constant-time
-/// equality checking.
-///
-/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides an
-/// explicit [`as_bytes`] method returning `&[u8; 32]`. However, byte arrays
-/// and slices don't provide constant-time equality checking, which is often a
-/// security requirement in software that handles private data. `Hash` doesn't
-/// implement [`Deref`] or [`AsRef`], to avoid situations where a type
-/// conversion happens implicitly and the constant-time property is
-/// accidentally lost.
-///
-/// `Hash` provides the [`to_hex`] method for converting to hexadecimal. It
-/// doesn't directly support converting from hexadecimal, but here's an example
-/// of doing that with the [`hex`] crate:
-///
-/// ```
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// use std::convert::TryInto;
-///
-/// let hash_hex = "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24";
-/// let hash_bytes = hex::decode(hash_hex)?;
-/// let hash_array: [u8; blake3::OUT_LEN] = hash_bytes[..].try_into()?;
-/// let hash: blake3::Hash = hash_array.into();
-/// # Ok(())
-/// # }
-/// ```
-///
-/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html
-/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html
-/// [`as_bytes`]: #method.as_bytes
-/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html
-/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html
-/// [`to_hex`]: #method.to_hex
-/// [`hex`]: https://crates.io/crates/hex
-#[derive(Clone, Copy, Hash)]
-pub struct Hash([u8; OUT_LEN]);
-
-impl Hash {
- /// The bytes of the `Hash`. Note that byte arrays don't provide
- /// constant-time equality checking, so if you need to compare hashes,
- /// prefer the `Hash` type.
- #[inline]
- pub fn as_bytes(&self) -> &[u8; OUT_LEN] {
- &self.0
- }
-
- /// The hexadecimal encoding of the `Hash`. The returned [`ArrayString`] is
- /// a fixed size and doesn't allocate memory on the heap. Note that
- /// [`ArrayString`] doesn't provide constant-time equality checking, so if
- /// you need to compare hashes, prefer the `Hash` type.
- ///
- /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html
- pub fn to_hex(&self) -> ArrayString<[u8; 2 * OUT_LEN]> {
- let mut s = ArrayString::new();
- let table = b"0123456789abcdef";
- for &b in self.0.iter() {
- s.push(table[(b >> 4) as usize] as char);
- s.push(table[(b & 0xf) as usize] as char);
- }
- s
- }
-}
-
-impl From<[u8; OUT_LEN]> for Hash {
- #[inline]
- fn from(bytes: [u8; OUT_LEN]) -> Self {
- Self(bytes)
- }
-}
-
-impl From<Hash> for [u8; OUT_LEN] {
- #[inline]
- fn from(hash: Hash) -> Self {
- hash.0
- }
-}
-
-/// This implementation is constant-time.
-impl PartialEq for Hash {
- #[inline]
- fn eq(&self, other: &Hash) -> bool {
- constant_time_eq::constant_time_eq_32(&self.0, &other.0)
- }
-}
-
-/// This implementation is constant-time.
-impl PartialEq<[u8; OUT_LEN]> for Hash {
- #[inline]
- fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
- constant_time_eq::constant_time_eq_32(&self.0, other)
- }
-}
-
-impl Eq for Hash {}
-
-impl fmt::Debug for Hash {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- // Formatting field as `&str` to reduce code size since the `Debug`
- // dynamic dispatch table for `&str` is likely needed elsewhere already,
- // but that for `ArrayString<[u8; 64]>` is not.
- let hex = self.to_hex();
- let hex: &str = hex.as_str();
-
- f.debug_tuple("Hash").field(&hex).finish()
- }
-}
-
-// Each chunk or parent node can produce either a 32-byte chaining value or, by
-// setting the ROOT flag, any number of final output bytes. The Output struct
-// captures the state just prior to choosing between those two possibilities.
-#[derive(Clone)]
-struct Output {
- input_chaining_value: CVWords,
- block: [u8; 64],
- block_len: u8,
- counter: u64,
- flags: u8,
- platform: Platform,
-}
-
-impl Output {
- fn chaining_value(&self) -> CVBytes {
- let mut cv = self.input_chaining_value;
- self.platform.compress_in_place(
- &mut cv,
- &self.block,
- self.block_len,
- self.counter,
- self.flags,
- );
- platform::le_bytes_from_words_32(&cv)
- }
-
- fn root_hash(&self) -> Hash {
- debug_assert_eq!(self.counter, 0);
- let mut cv = self.input_chaining_value;
- self.platform
- .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
- Hash(platform::le_bytes_from_words_32(&cv))
- }
-
- fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
- self.platform.compress_xof(
- &self.input_chaining_value,
- &self.block,
- self.block_len,
- self.counter,
- self.flags | ROOT,
- )
- }
-}
-
-#[derive(Clone)]
-struct ChunkState {
- cv: CVWords,
- chunk_counter: u64,
- buf: [u8; BLOCK_LEN],
- buf_len: u8,
- blocks_compressed: u8,
- flags: u8,
- platform: Platform,
-}
-
-impl ChunkState {
- fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
- Self {
- cv: *key,
- chunk_counter,
- buf: [0; BLOCK_LEN],
- buf_len: 0,
- blocks_compressed: 0,
- flags,
- platform,
- }
- }
-
- fn len(&self) -> usize {
- BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
- }
-
- fn fill_buf(&mut self, input: &mut &[u8]) {
- let want = BLOCK_LEN - self.buf_len as usize;
- let take = cmp::min(want, input.len());
- self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
- self.buf_len += take as u8;
- *input = &input[take..];
- }
-
- fn start_flag(&self) -> u8 {
- if self.blocks_compressed == 0 {
- CHUNK_START
- } else {
- 0
- }
- }
-
- // Try to avoid buffering as much as possible, by compressing directly from
- // the input slice when full blocks are available.
- fn update(&mut self, mut input: &[u8]) -> &mut Self {
- if self.buf_len > 0 {
- self.fill_buf(&mut input);
- if !input.is_empty() {
- debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
- let block_flags = self.flags | self.start_flag(); // borrowck
- self.platform.compress_in_place(
- &mut self.cv,
- &self.buf,
- BLOCK_LEN as u8,
- self.chunk_counter,
- block_flags,
- );
- self.buf_len = 0;
- self.buf = [0; BLOCK_LEN];
- self.blocks_compressed += 1;
- }
- }
-
- while input.len() > BLOCK_LEN {
- debug_assert_eq!(self.buf_len, 0);
- let block_flags = self.flags | self.start_flag(); // borrowck
- self.platform.compress_in_place(
- &mut self.cv,
- array_ref!(input, 0, BLOCK_LEN),
- BLOCK_LEN as u8,
- self.chunk_counter,
- block_flags,
- );
- self.blocks_compressed += 1;
- input = &input[BLOCK_LEN..];
- }
-
- self.fill_buf(&mut input);
- debug_assert!(input.is_empty());
- debug_assert!(self.len() <= CHUNK_LEN);
- self
- }
-
- fn output(&self) -> Output {
- let block_flags = self.flags | self.start_flag() | CHUNK_END;
- Output {
- input_chaining_value: self.cv,
- block: self.buf,
- block_len: self.buf_len,
- counter: self.chunk_counter,
- flags: block_flags,
- platform: self.platform,
- }
- }
-}
-
-// Don't derive(Debug), because the state may be secret.
-impl fmt::Debug for ChunkState {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- f.debug_struct("ChunkState")
- .field("len", &self.len())
- .field("chunk_counter", &self.chunk_counter)
- .field("flags", &self.flags)
- .field("platform", &self.platform)
- .finish()
- }
-}
-
-// IMPLEMENTATION NOTE
-// ===================
-// The recursive function compress_subtree_wide(), implemented below, is the
-// basis of high-performance BLAKE3. We use it both for all-at-once hashing,
-// and for the incremental input with Hasher (though we have to be careful with
-// subtree boundaries in the incremental case). compress_subtree_wide() applies
-// several optimizations at the same time:
-// - Multi-threading with Rayon.
-// - Parallel chunk hashing with SIMD.
-// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing
-// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues
-// to benefit from larger inputs, because more levels of the tree benefit can
-// use full-width SIMD vectors for parent hashing. Without parallel parent
-// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.
-
-// pub for benchmarks
-#[doc(hidden)]
-#[derive(Clone, Copy)]
-pub enum IncrementCounter {
- Yes,
- No,
-}
-
-impl IncrementCounter {
- #[inline]
- fn yes(&self) -> bool {
- match self {
- IncrementCounter::Yes => true,
- IncrementCounter::No => false,
- }
- }
-}
-
-// The largest power of two less than or equal to `n`, used for left_len()
-// immediately below, and also directly in Hasher::update().
-fn largest_power_of_two_leq(n: usize) -> usize {
- ((n / 2) + 1).next_power_of_two()
-}
-
-// Given some input larger than one chunk, return the number of bytes that
-// should go in the left subtree. This is the largest power-of-2 number of
-// chunks that leaves at least 1 byte for the right subtree.
-fn left_len(content_len: usize) -> usize {
- debug_assert!(content_len > CHUNK_LEN);
- // Subtract 1 to reserve at least one byte for the right side.
- let full_chunks = (content_len - 1) / CHUNK_LEN;
- largest_power_of_two_leq(full_chunks) * CHUNK_LEN
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
-// on a single thread. Write out the chunk chaining values and return the
-// number of chunks hashed. These chunks are never the root and never empty;
-// those cases use a different codepath.
-fn compress_chunks_parallel(
- input: &[u8],
- key: &CVWords,
- chunk_counter: u64,
- flags: u8,
- platform: Platform,
- out: &mut [u8],
-) -> usize {
- debug_assert!(!input.is_empty(), "empty chunks below the root");
- debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);
-
- let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
- let mut chunks_array = ArrayVec::<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]>::new();
- for chunk in &mut chunks_exact {
- chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN));
- }
- platform.hash_many(
- &chunks_array,
- key,
- chunk_counter,
- IncrementCounter::Yes,
- flags,
- CHUNK_START,
- CHUNK_END,
- out,
- );
-
- // Hash the remaining partial chunk, if there is one. Note that the empty
- // chunk (meaning the empty message) is a different codepath.
- let chunks_so_far = chunks_array.len();
- if !chunks_exact.remainder().is_empty() {
- let counter = chunk_counter + chunks_so_far as u64;
- let mut chunk_state = ChunkState::new(key, counter, flags, platform);
- chunk_state.update(chunks_exact.remainder());
- *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
- chunk_state.output().chaining_value();
- chunks_so_far + 1
- } else {
- chunks_so_far
- }
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
-// on a single thread. Write out the parent chaining values and return the
-// number of parents hashed. (If there's an odd input chaining value left over,
-// return it as an additional output.) These parents are never the root and
-// never empty; those cases use a different codepath.
-fn compress_parents_parallel(
- child_chaining_values: &[u8],
- key: &CVWords,
- flags: u8,
- platform: Platform,
- out: &mut [u8],
-) -> usize {
- debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes");
- let num_children = child_chaining_values.len() / OUT_LEN;
- debug_assert!(num_children >= 2, "not enough children");
- debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many");
-
- let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN);
- // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of
- // the requirements of compress_subtree_wide().
- let mut parents_array = ArrayVec::<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE_OR_2]>::new();
- for parent in &mut parents_exact {
- parents_array.push(array_ref!(parent, 0, BLOCK_LEN));
- }
- platform.hash_many(
- &parents_array,
- key,
- 0, // Parents always use counter 0.
- IncrementCounter::No,
- flags | PARENT,
- 0, // Parents have no start flags.
- 0, // Parents have no end flags.
- out,
- );
-
- // If there's an odd child left over, it becomes an output.
- let parents_so_far = parents_array.len();
- if !parents_exact.remainder().is_empty() {
- out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder());
- parents_so_far + 1
- } else {
- parents_so_far
- }
-}
-
-// The wide helper function returns (writes out) an array of chaining values
-// and returns the length of that array. The number of chaining values returned
-// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
-// if the input is shorter than that many chunks. The reason for maintaining a
-// wide array of chaining values going back up the tree, is to allow the
-// implementation to hash as many parents in parallel as possible.
-//
-// As a special case when the SIMD degree is 1, this function will still return
-// at least 2 outputs. This guarantees that this function doesn't perform the
-// root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement exendable ouput.) Note that this function is
-// not used when the whole input is only 1 chunk long; that's a different
-// codepath.
-//
-// Why not just have the caller split the input on the first update(), instead
-// of implementing this special rule? Because we don't want to limit SIMD or
-// multi-threading parallelism for that update().
-fn compress_subtree_wide<J: Join>(
- input: &[u8],
- key: &CVWords,
- chunk_counter: u64,
- flags: u8,
- platform: Platform,
- out: &mut [u8],
-) -> usize {
- // Note that the single chunk case does *not* bump the SIMD degree up to 2
- // when it is 1. This allows Rayon the option of multi-threading even the
- // 2-chunk case, which can help performance on smaller platforms.
- if input.len() <= platform.simd_degree() * CHUNK_LEN {
- return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
- }
-
- // With more than simd_degree chunks, we need to recurse. Start by dividing
- // the input into left and right subtrees. (Note that this is only optimal
- // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
- // of 3 or something, we'll need a more complicated strategy.)
- debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
- let (left, right) = input.split_at(left_len(input.len()));
- let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;
-
- // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
- // account for the special case of returning 2 outputs when the SIMD degree
- // is 1.
- let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
- let degree = if left.len() == CHUNK_LEN {
- // The "simd_degree=1 and we're at the leaf nodes" case.
- debug_assert_eq!(platform.simd_degree(), 1);
- 1
- } else {
- cmp::max(platform.simd_degree(), 2)
- };
- let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN);
-
- // Recurse! This uses multiple threads if the "rayon" feature is enabled.
- let (left_n, right_n) = J::join(
- || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out),
- || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out),
- left.len(),
- right.len(),
- );
-
- // The special case again. If simd_degree=1, then we'll have left_n=1 and
- // right_n=1. Rather than compressing them into a single output, return
- // them directly, to make sure we always have at least two outputs.
- debug_assert_eq!(left_n, degree);
- debug_assert!(right_n >= 1 && right_n <= left_n);
- if left_n == 1 {
- out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]);
- return 2;
- }
-
- // Otherwise, do one layer of parent node compression.
- let num_children = left_n + right_n;
- compress_parents_parallel(
- &cv_array[..num_children * OUT_LEN],
- key,
- flags,
- platform,
- out,
- )
-}
-
-// Hash a subtree with compress_subtree_wide(), and then condense the resulting
-// list of chaining values down to a single parent node. Don't compress that
-// last parent node, however. Instead, return its message bytes (the
-// concatenated chaining values of its children). This is necessary when the
-// first call to update() supplies a complete subtree, because the topmost
-// parent node of that subtree could end up being the root. It's also necessary
-// for extended output in the general case.
-//
-// As with compress_subtree_wide(), this function is not used on inputs of 1
-// chunk or less. That's a different codepath.
-fn compress_subtree_to_parent_node<J: Join>(
- input: &[u8],
- key: &CVWords,
- chunk_counter: u64,
- flags: u8,
- platform: Platform,
-) -> [u8; BLOCK_LEN] {
- debug_assert!(input.len() > CHUNK_LEN);
- let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
- let mut num_cvs =
- compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array);
- debug_assert!(num_cvs >= 2);
-
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
- // compress_subtree_wide() returns more than 2 chaining values. Condense
- // them into 2 by forming parent nodes repeatedly.
- let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2];
- while num_cvs > 2 {
- let cv_slice = &cv_array[..num_cvs * OUT_LEN];
- num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array);
- cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]);
- }
- *array_ref!(cv_array, 0, 2 * OUT_LEN)
-}
-
-// Hash a complete input all at once. Unlike compress_subtree_wide() and
-// compress_subtree_to_parent_node(), this function handles the 1 chunk case.
- // Note that we use SerialJoin here, so this is always single-threaded.
-fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output {
- let platform = Platform::detect();
-
- // If the whole subtree is one chunk, hash it directly with a ChunkState.
- if input.len() <= CHUNK_LEN {
- return ChunkState::new(key, 0, flags, platform)
- .update(input)
- .output();
- }
-
- // Otherwise construct an Output object from the parent node returned by
- // compress_subtree_to_parent_node().
- Output {
- input_chaining_value: *key,
- block: compress_subtree_to_parent_node::<SerialJoin>(input, key, 0, flags, platform),
- block_len: BLOCK_LEN as u8,
- counter: 0,
- flags: flags | PARENT,
- platform,
- }
-}
-
-/// The default hash function.
-///
-/// For an incremental version that accepts multiple writes, see [`Hasher::update`].
-///
-/// This function is always single-threaded. For multi-threading support, see
-/// [`Hasher::update_with_join`].
-///
-/// [`Hasher::update`]: struct.Hasher.html#method.update
-/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join
-pub fn hash(input: &[u8]) -> Hash {
- hash_all_at_once(input, IV, 0).root_hash()
-}
-
-/// The keyed hash function.
-///
-/// This is suitable for use as a message authentication code, for
-/// example to replace an HMAC instance.
-/// In that use case, the constant-time equality checking provided by
-/// [`Hash`](struct.Hash.html) is almost always a security requirement, and
-/// callers need to be careful not to compare MACs as raw bytes.
-///
-/// This function is always single-threaded. For multi-threading support, see
-/// [`Hasher::update_with_join`].
-///
-/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join
-pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
- let key_words = platform::words_from_le_bytes_32(key);
- hash_all_at_once(input, &key_words, KEYED_HASH).root_hash()
-}
-
-/// The key derivation function.
-///
-/// Given cryptographic key material of any length and a context string of any
-/// length, this function outputs a derived subkey of any length. **The context
-/// string should be hardcoded, globally unique, and application-specific.** A
-/// good default format for such strings is `"[application] [commit timestamp]
-/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`.
-///
-/// Key derivation is important when you want to use the same key in multiple
-/// algorithms or use cases. Using the same key with different cryptographic
-/// algorithms is generally forbidden, and deriving a separate subkey for each
-/// use case protects you from bad interactions. Derived keys also mitigate the
-/// damage from one part of your application accidentally leaking its key.
-///
-/// As a rare exception to that general rule, however, it is possible to use
-/// `derive_key` itself with key material that you are already using with
-/// another algorithm. You might need to do this if you're adding features to
-/// an existing application, which does not yet use key derivation internally.
-/// However, you still must not share key material with algorithms that forbid
-/// key reuse entirely, like a one-time pad.
-///
-/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be
-/// used with passwords.** Instead, use a dedicated password hash like
-/// [Argon2]. Password hashes are entirely different from generic hash
-/// functions, with opposite design requirements.
-///
-/// This function is always single-threaded. For multi-threading support, see
-/// [`Hasher::update_with_join`].
-///
-/// [`Hasher::new_derive_key`]: struct.Hasher.html#method.new_derive_key
-/// [`Hasher::finalize_xof`]: struct.Hasher.html#method.finalize_xof
-/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
-/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join
-pub fn derive_key(context: &str, key_material: &[u8], output: &mut [u8]) {
- let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash();
- let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
- let inner_output = hash_all_at_once(key_material, &context_key_words, DERIVE_KEY_MATERIAL);
- OutputReader::new(inner_output).fill(output);
-}
-
-fn parent_node_output(
- left_child: &CVBytes,
- right_child: &CVBytes,
- key: &CVWords,
- flags: u8,
- platform: Platform,
-) -> Output {
- let mut block = [0; BLOCK_LEN];
- block[..32].copy_from_slice(left_child);
- block[32..].copy_from_slice(right_child);
- Output {
- input_chaining_value: *key,
- block,
- block_len: BLOCK_LEN as u8,
- counter: 0,
- flags: flags | PARENT,
- platform,
- }
-}
-
-/// An incremental hash state that can accept any number of writes.
-///
-/// In addition to its inherent methods, this type implements several commonly
-/// used traits from the [`digest`](https://crates.io/crates/digest) and
-/// [`crypto_mac`](https://crates.io/crates/crypto-mac) crates.
-///
-/// **Performance note:** The [`update`] and [`update_with_join`] methods
-/// perform poorly when the caller's input buffer is small. See their method
-/// docs below. A 16 KiB buffer is large enough to leverage all currently
-/// supported SIMD instruction sets.
-///
-/// # Examples
-///
-/// ```
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// // Hash an input incrementally.
-/// let mut hasher = blake3::Hasher::new();
-/// hasher.update(b"foo");
-/// hasher.update(b"bar");
-/// hasher.update(b"baz");
-/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz"));
-///
-/// // Extended output. OutputReader also implements Read and Seek.
-/// # #[cfg(feature = "std")] {
-/// let mut output = [0; 1000];
-/// let mut output_reader = hasher.finalize_xof();
-/// output_reader.fill(&mut output);
-/// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes());
-/// # }
-/// # Ok(())
-/// # }
-/// ```
-///
-/// [`update`]: #method.update
-/// [`update_with_join`]: #method.update_with_join
-#[derive(Clone)]
-pub struct Hasher {
- key: CVWords,
- chunk_state: ChunkState,
- // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
- // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
- // requires a 4th entry, rather than merging everything down to 1, because
- // we don't know whether more input is coming. This is different from how
- // the reference implementation does things.
- cv_stack: ArrayVec<[CVBytes; MAX_DEPTH + 1]>,
-}
-
-impl Hasher {
- fn new_internal(key: &CVWords, flags: u8) -> Self {
- Self {
- key: *key,
- chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
- cv_stack: ArrayVec::new(),
- }
- }
-
- /// Construct a new `Hasher` for the regular hash function.
- pub fn new() -> Self {
- Self::new_internal(IV, 0)
- }
-
- /// Construct a new `Hasher` for the keyed hash function. See
- /// [`keyed_hash`].
- ///
- /// [`keyed_hash`]: fn.keyed_hash.html
- pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
- let key_words = platform::words_from_le_bytes_32(key);
- Self::new_internal(&key_words, KEYED_HASH)
- }
-
- /// Construct a new `Hasher` for the key derivation function. See
- /// [`derive_key`]. The context string should be hardcoded, globally
- /// unique, and application-specific.
- ///
- /// [`derive_key`]: fn.derive_key.html
- pub fn new_derive_key(context: &str) -> Self {
- let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash();
- let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
- Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
- }
-
- /// Reset the `Hasher` to its initial state.
- ///
- /// This is functionally the same as overwriting the `Hasher` with a new
- /// one, using the same key or context string if any. However, depending on
- /// how much inlining the optimizer does, moving a `Hasher` might copy its
- /// entire CV stack, most of which is useless uninitialized bytes. This
- /// method avoids that copy.
- pub fn reset(&mut self) -> &mut Self {
- self.chunk_state = ChunkState::new(
- &self.key,
- 0,
- self.chunk_state.flags,
- self.chunk_state.platform,
- );
- self.cv_stack.clear();
- self
- }
-
- // As described in push_cv() below, we do "lazy merging", delaying merges
- // until right before the next CV is about to be added. This is different
- // from the reference implementation. Another difference is that we aren't
- // always merging 1 chunk at a time. Instead, each CV might represent any
- // power-of-two number of chunks, as long as the smaller-above-larger stack
- // order is maintained. Instead of the "count the trailing 0-bits"
- // algorithm described in the spec, we use a "count the total number of
- // 1-bits" variant that doesn't require us to retain the subtree size of
- // the CV on top of the stack. The principle is the same: each CV that
- // should remain in the stack is represented by a 1-bit in the total number
- // of chunks (or bytes) so far.
- fn merge_cv_stack(&mut self, total_len: u64) {
- let post_merge_stack_len = total_len.count_ones() as usize;
- while self.cv_stack.len() > post_merge_stack_len {
- let right_child = self.cv_stack.pop().unwrap();
- let left_child = self.cv_stack.pop().unwrap();
- let parent_output = parent_node_output(
- &left_child,
- &right_child,
- &self.key,
- self.chunk_state.flags,
- self.chunk_state.platform,
- );
- self.cv_stack.push(parent_output.chaining_value());
- }
- }
-
- // In reference_impl.rs, we merge the new CV with existing CVs from the
- // stack before pushing it. We can do that because we know more input is
- // coming, so we know none of the merges are root.
- //
- // This setting is different. We want to feed as much input as possible to
- // compress_subtree_wide(), without setting aside anything for the
- // chunk_state. If the user gives us 64 KiB, we want to parallelize over
- // all 64 KiB at once as a single subtree, if at all possible.
- //
- // This leads to two problems:
- // 1) This 64 KiB input might be the only call that ever gets made to
- // update. In this case, the root node of the 64 KiB subtree would be
- // the root node of the whole tree, and it would need to be ROOT
- // finalized. We can't compress it until we know.
- // 2) This 64 KiB input might complete a larger tree, whose root node is
- // similarly going to be the root of the whole tree. For example,
- // maybe we have 192 KiB (that is, 128 + 64) hashed so far. We can't
- // compress the node at the root of the 256 KiB subtree until we know
- // how to finalize it.
- //
- // The second problem is solved with "lazy merging". That is, when we're
- // about to add a CV to the stack, we don't merge it with anything first,
- // as the reference impl does. Instead we do merges using the *previous* CV
- // that was added, which is sitting on top of the stack, and we put the new
- // CV (unmerged) on top of the stack afterwards. This guarantees that we
- // never merge the root node until finalize().
- //
- // Solving the first problem requires an additional tool,
- // compress_subtree_to_parent_node(). That function always returns the top
- // *two* chaining values of the subtree it's compressing. We then do lazy
- // merging with each of them separately, so that the second CV will always
- // remain unmerged. (That also helps us support extendable output when
- // we're hashing an input all-at-once.)
- fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
- self.merge_cv_stack(chunk_counter);
- self.cv_stack.push(*new_cv);
- }
-
- /// Add input bytes to the hash state. You can call this any number of
- /// times.
- ///
- /// This method is always single-threaded. For multi-threading support, see
- /// `update_with_join` below.
- ///
- /// Note that the degree of SIMD parallelism that `update` can use is
- /// limited by the size of this input buffer. The 8 KiB buffer currently
- /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but
- /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to
- /// leverage all currently supported SIMD instruction sets.
- ///
- /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html
- pub fn update(&mut self, input: &[u8]) -> &mut Self {
- self.update_with_join::<SerialJoin>(input)
- }
-
- /// Add input bytes to the hash state, as with `update`, but potentially
- /// using multi-threading. See the example below, and the
- /// [`join`](join/index.html) module for a more detailed explanation.
- ///
- /// To get any performance benefit from multi-threading, the input buffer
- /// size needs to be very large. As a rule of thumb on x86_64, there is no
- /// benefit to multi-threading inputs less than 128 KiB. Other platforms
- /// have different thresholds, and in general you need to benchmark your
- /// specific use case. Where possible, memory mapping an entire input file
- /// is recommended, to take maximum advantage of multi-threading without
- /// needing to tune a specific buffer size. Where memory mapping is not
- /// possible, good multi-threading performance requires doing IO on a
- /// background thread, to avoid sleeping all your worker threads while the
- /// input buffer is (serially) refilled. This is quite complicated compared
- /// to memory mapping.
- ///
- /// # Example
- ///
- /// ```
- /// // Hash a large input using multi-threading. Note that multi-threading
- /// // comes with some overhead, and it can actually hurt performance for small
- /// // inputs. The meaning of "small" varies, however, depending on the
- /// // platform and the number of threads. (On x86_64, the cutoff tends to be
- /// // around 128 KiB.) You should benchmark your own use case to see whether
- /// // multi-threading helps.
- /// # #[cfg(feature = "rayon")]
- /// # {
- /// # fn some_large_input() -> &'static [u8] { b"foo" }
- /// let input: &[u8] = some_large_input();
- /// let mut hasher = blake3::Hasher::new();
- /// hasher.update_with_join::<blake3::join::RayonJoin>(input);
- /// let hash = hasher.finalize();
- /// # }
- /// ```
- pub fn update_with_join<J: Join>(&mut self, mut input: &[u8]) -> &mut Self {
- // If we have some partial chunk bytes in the internal chunk_state, we
- // need to finish that chunk first.
- if self.chunk_state.len() > 0 {
- let want = CHUNK_LEN - self.chunk_state.len();
- let take = cmp::min(want, input.len());
- self.chunk_state.update(&input[..take]);
- input = &input[take..];
- if !input.is_empty() {
- // We've filled the current chunk, and there's more input
- // coming, so we know it's not the root and we can finalize it.
- // Then we'll proceed to hashing whole chunks below.
- debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
- let chunk_cv = self.chunk_state.output().chaining_value();
- self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
- self.chunk_state = ChunkState::new(
- &self.key,
- self.chunk_state.chunk_counter + 1,
- self.chunk_state.flags,
- self.chunk_state.platform,
- );
- } else {
- return self;
- }
- }
-
- // Now the chunk_state is clear, and we have more input. If there's
- // more than a single chunk (so, definitely not the root chunk), hash
- // the largest whole subtree we can, with the full benefits of SIMD and
- // multi-threading parallelism. Two restrictions:
- // - The subtree has to be a power-of-2 number of chunks. Only subtrees
- // along the right edge can be incomplete, and we don't know where
- // the right edge is going to be until we get to finalize().
- // - The subtree must evenly divide the total number of chunks up until
- // this point (if total is not 0). If the current incomplete subtree
- // is only waiting for 1 more chunk, we can't hash a subtree of 4
- // chunks. We have to complete the current subtree first.
- // Because we might need to break up the input to form powers of 2, or
- // to evenly divide what we already have, this part runs in a loop.
- while input.len() > CHUNK_LEN {
- debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
- debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
- let mut subtree_len = largest_power_of_two_leq(input.len());
- let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
- // Shrink the subtree_len until it evenly divides the count so far.
- // We know that subtree_len itself is a power of 2, so we can use a
- // bitmasking trick instead of an actual remainder operation. (Note
- // that if the caller consistently passes power-of-2 inputs of the
- // same size, as is hopefully typical, this loop condition will
- // always fail, and subtree_len will always be the full length of
- // the input.)
- //
- // An aside: We don't have to shrink subtree_len quite this much.
- // For example, if count_so_far is 1, we could pass 2 chunks to
- // compress_subtree_to_parent_node. Since we'll get 2 CVs back,
- // we'll still get the right answer in the end, and we might get to
- // use 2-way SIMD parallelism. The problem with this optimization,
- // is that it gets us stuck always hashing 2 chunks. The total
- // number of chunks will remain odd, and we'll never graduate to
- // higher degrees of parallelism. See
- // https://github.com/BLAKE3-team/BLAKE3/issues/69.
- while (subtree_len - 1) as u64 & count_so_far != 0 {
- subtree_len /= 2;
- }
- // The shrunken subtree_len might now be 1 chunk long. If so, hash
- // that one chunk by itself. Otherwise, compress the subtree into a
- // pair of CVs.
- let subtree_chunks = (subtree_len / CHUNK_LEN) as u64;
- if subtree_len <= CHUNK_LEN {
- debug_assert_eq!(subtree_len, CHUNK_LEN);
- self.push_cv(
- &ChunkState::new(
- &self.key,
- self.chunk_state.chunk_counter,
- self.chunk_state.flags,
- self.chunk_state.platform,
- )
- .update(&input[..subtree_len])
- .output()
- .chaining_value(),
- self.chunk_state.chunk_counter,
- );
- } else {
- // This is the high-performance happy path, though getting here
- // depends on the caller giving us a long enough input.
- let cv_pair = compress_subtree_to_parent_node::<J>(
- &input[..subtree_len],
- &self.key,
- self.chunk_state.chunk_counter,
- self.chunk_state.flags,
- self.chunk_state.platform,
- );
- let left_cv = array_ref!(cv_pair, 0, 32);
- let right_cv = array_ref!(cv_pair, 32, 32);
- // Push the two CVs we received into the CV stack in order. Because
- // the stack merges lazily, this guarantees we aren't merging the
- // root.
- self.push_cv(left_cv, self.chunk_state.chunk_counter);
- self.push_cv(
- right_cv,
- self.chunk_state.chunk_counter + (subtree_chunks / 2),
- );
- }
- self.chunk_state.chunk_counter += subtree_chunks;
- input = &input[subtree_len..];
- }
-
- // What remains is 1 chunk or less. Add it to the chunk state.
- debug_assert!(input.len() <= CHUNK_LEN);
- if !input.is_empty() {
- self.chunk_state.update(input);
- // Having added some input to the chunk_state, we know what's in
- // the CV stack won't become the root node, and we can do an extra
- // merge. This simplifies finalize().
- self.merge_cv_stack(self.chunk_state.chunk_counter);
- }
-
- self
- }
-
- fn final_output(&self) -> Output {
- // If the current chunk is the only chunk, that makes it the root node
- // also. Convert it directly into an Output. Otherwise, we need to
- // merge subtrees below.
- if self.cv_stack.is_empty() {
- debug_assert_eq!(self.chunk_state.chunk_counter, 0);
- return self.chunk_state.output();
- }
-
- // If there are any bytes in the ChunkState, finalize that chunk and
- // merge its CV with everything in the CV stack. In that case, the work
- // we did at the end of update() above guarantees that the stack
- // doesn't contain any unmerged subtrees that need to be merged first.
- // (This is important, because if there were two chunk hashes sitting
- // on top of the stack, they would need to merge with each other, and
- // merging a new chunk hash into them would be incorrect.)
- //
- // If there are no bytes in the ChunkState, we'll merge what's already
- // in the stack. In this case it's fine if there are unmerged chunks on
- // top, because we'll merge them with each other. Note that the case of
- // the empty chunk is taken care of above.
- let mut output: Output;
- let mut num_cvs_remaining = self.cv_stack.len();
- if self.chunk_state.len() > 0 {
- debug_assert_eq!(
- self.cv_stack.len(),
- self.chunk_state.chunk_counter.count_ones() as usize,
- "cv stack does not need a merge"
- );
- output = self.chunk_state.output();
- } else {
- debug_assert!(self.cv_stack.len() >= 2);
- output = parent_node_output(
- &self.cv_stack[num_cvs_remaining - 2],
- &self.cv_stack[num_cvs_remaining - 1],
- &self.key,
- self.chunk_state.flags,
- self.chunk_state.platform,
- );
- num_cvs_remaining -= 2;
- }
- while num_cvs_remaining > 0 {
- output = parent_node_output(
- &self.cv_stack[num_cvs_remaining - 1],
- &output.chaining_value(),
- &self.key,
- self.chunk_state.flags,
- self.chunk_state.platform,
- );
- num_cvs_remaining -= 1;
- }
- output
- }
-
- /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of
- /// the input.
- ///
- /// This method is idempotent. Calling it twice will give the same result.
- /// You can also add more input and finalize again.
- pub fn finalize(&self) -> Hash {
- self.final_output().root_hash()
- }
-
- /// Finalize the hash state and return an [`OutputReader`], which can
- /// supply any number of output bytes.
- ///
- /// This method is idempotent. Calling it twice will give the same result.
- /// You can also add more input and finalize again.
- ///
- /// [`OutputReader`]: struct.OutputReader.html
- pub fn finalize_xof(&self) -> OutputReader {
- OutputReader::new(self.final_output())
- }
-}
-
-// Don't derive(Debug), because the state may be secret.
-impl fmt::Debug for Hasher {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- f.debug_struct("Hasher")
- .field("flags", &self.chunk_state.flags)
- .field("platform", &self.chunk_state.platform)
- .finish()
- }
-}
-
-impl Default for Hasher {
- #[inline]
- fn default() -> Self {
- Self::new()
- }
-}
-
-#[cfg(feature = "std")]
-impl std::io::Write for Hasher {
- /// This is equivalent to [`update`](#method.update).
- #[inline]
- fn write(&mut self, input: &[u8]) -> std::io::Result<usize> {
- self.update(input);
- Ok(input.len())
- }
-
- #[inline]
- fn flush(&mut self) -> std::io::Result<()> {
- Ok(())
- }
-}
-
-/// An incremental reader for extended output, returned by
-/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
-#[derive(Clone)]
-pub struct OutputReader {
- inner: Output,
- position_within_block: u8,
-}
-
-impl OutputReader {
- fn new(inner: Output) -> Self {
- Self {
- inner,
- position_within_block: 0,
- }
- }
-
- /// Fill a buffer with output bytes and advance the position of the
- /// `OutputReader`. This is equivalent to [`Read::read`], except that it
- /// doesn't return a `Result`. Both methods always fill the entire buffer.
- ///
- /// Note that `OutputReader` doesn't buffer output bytes internally, so
- /// calling `fill` repeatedly with a short-length or odd-length slice will
- /// end up performing the same compression multiple times. If you're
- /// reading output in a loop, prefer a slice length that's a multiple of
- /// 64.
- ///
- /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try
- /// to extract more than that, for example by seeking near the end and
- /// reading further, the behavior is unspecified.
- ///
- /// [`Read::read`]: #method.read
- pub fn fill(&mut self, mut buf: &mut [u8]) {
- while !buf.is_empty() {
- let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
- let output_bytes = &block[self.position_within_block as usize..];
- let take = cmp::min(buf.len(), output_bytes.len());
- buf[..take].copy_from_slice(&output_bytes[..take]);
- buf = &mut buf[take..];
- self.position_within_block += take as u8;
- if self.position_within_block == BLOCK_LEN as u8 {
- self.inner.counter += 1;
- self.position_within_block = 0;
- }
- }
- }
-
- /// Return the current read position in the output stream. The position of
- /// a new `OutputReader` starts at 0, and each call to [`fill`] or
- /// [`Read::read`] moves the position forward by the number of bytes read.
- ///
- /// [`fill`]: #method.fill
- /// [`Read::read`]: #method.read
- pub fn position(&self) -> u64 {
- self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
- }
-
- /// Seek to a new read position in the output stream. This is equivalent to
- /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't
- /// return a `Result`.
- ///
- /// [`Seek::seek`]: #method.seek
- /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html
- pub fn set_position(&mut self, position: u64) {
- self.position_within_block = (position % BLOCK_LEN as u64) as u8;
- self.inner.counter = position / BLOCK_LEN as u64;
- }
-}
-
-// Don't derive(Debug), because the state may be secret.
-impl fmt::Debug for OutputReader {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- f.debug_struct("OutputReader")
- .field("position", &self.position())
- .finish()
- }
-}
-
-#[cfg(feature = "std")]
-impl std::io::Read for OutputReader {
- #[inline]
- fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
- self.fill(buf);
- Ok(buf.len())
- }
-}
-
-#[cfg(feature = "std")]
-impl std::io::Seek for OutputReader {
- fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
- let max_position = u64::max_value() as i128;
- let target_position: i128 = match pos {
- std::io::SeekFrom::Start(x) => x as i128,
- std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128,
- std::io::SeekFrom::End(_) => {
- return Err(std::io::Error::new(
- std::io::ErrorKind::InvalidInput,
- "seek from end not supported",
- ));
- }
- };
- if target_position < 0 {
- return Err(std::io::Error::new(
- std::io::ErrorKind::InvalidInput,
- "seek before start",
- ));
- }
- self.set_position(cmp::min(target_position, max_position) as u64);
- Ok(self.position())
- }
-}
diff --git a/thirdparty/BLAKE3/src/platform.rs b/thirdparty/BLAKE3/src/platform.rs
deleted file mode 100644
index 4bd67de7a..000000000
--- a/thirdparty/BLAKE3/src/platform.rs
+++ /dev/null
@@ -1,487 +0,0 @@
-use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
-use arrayref::{array_mut_ref, array_ref};
-
-cfg_if::cfg_if! {
- if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
- cfg_if::cfg_if! {
- if #[cfg(blake3_avx512_ffi)] {
- pub const MAX_SIMD_DEGREE: usize = 16;
- } else {
- pub const MAX_SIMD_DEGREE: usize = 8;
- }
- }
- } else if #[cfg(feature = "neon")] {
- pub const MAX_SIMD_DEGREE: usize = 4;
- } else {
- pub const MAX_SIMD_DEGREE: usize = 1;
- }
-}
-
-// There are some places where we want a static size that's equal to the
-// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
-// allowed to use cmp::max, so we have to hardcode this additional constant
-// value. Get rid of this once cmp::max is a const fn.
-cfg_if::cfg_if! {
- if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
- cfg_if::cfg_if! {
- if #[cfg(blake3_avx512_ffi)] {
- pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
- } else {
- pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
- }
- }
- } else if #[cfg(feature = "neon")] {
- pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
- } else {
- pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
- }
-}
-
-#[derive(Clone, Copy, Debug)]
-pub enum Platform {
- Portable,
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- SSE2,
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- SSE41,
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- AVX2,
- #[cfg(blake3_avx512_ffi)]
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- AVX512,
- #[cfg(feature = "neon")]
- NEON,
-}
-
-impl Platform {
- #[allow(unreachable_code)]
- pub fn detect() -> Self {
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- {
- #[cfg(blake3_avx512_ffi)]
- {
- if avx512_detected() {
- return Platform::AVX512;
- }
- }
- if avx2_detected() {
- return Platform::AVX2;
- }
- if sse41_detected() {
- return Platform::SSE41;
- }
- if sse2_detected() {
- return Platform::SSE2;
- }
- }
- // We don't use dynamic feature detection for NEON. If the "neon"
- // feature is on, NEON is assumed to be supported.
- #[cfg(feature = "neon")]
- {
- return Platform::NEON;
- }
- Platform::Portable
- }
-
- pub fn simd_degree(&self) -> usize {
- let degree = match self {
- Platform::Portable => 1,
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE2 => 4,
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE41 => 4,
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::AVX2 => 8,
- #[cfg(blake3_avx512_ffi)]
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::AVX512 => 16,
- #[cfg(feature = "neon")]
- Platform::NEON => 4,
- };
- debug_assert!(degree <= MAX_SIMD_DEGREE);
- degree
- }
-
- pub fn compress_in_place(
- &self,
- cv: &mut CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
- ) {
- match self {
- Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
- // Safe because detect() checked for platform support.
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE2 => unsafe {
- crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
- },
- // Safe because detect() checked for platform support.
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE41 | Platform::AVX2 => unsafe {
- crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
- },
- // Safe because detect() checked for platform support.
- #[cfg(blake3_avx512_ffi)]
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::AVX512 => unsafe {
- crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
- },
- // No NEON compress_in_place() implementation yet.
- #[cfg(feature = "neon")]
- Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
- }
- }
-
- pub fn compress_xof(
- &self,
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
- ) -> [u8; 64] {
- match self {
- Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
- // Safe because detect() checked for platform support.
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE2 => unsafe {
- crate::sse2::compress_xof(cv, block, block_len, counter, flags)
- },
- // Safe because detect() checked for platform support.
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE41 | Platform::AVX2 => unsafe {
- crate::sse41::compress_xof(cv, block, block_len, counter, flags)
- },
- // Safe because detect() checked for platform support.
- #[cfg(blake3_avx512_ffi)]
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::AVX512 => unsafe {
- crate::avx512::compress_xof(cv, block, block_len, counter, flags)
- },
- // No NEON compress_xof() implementation yet.
- #[cfg(feature = "neon")]
- Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
- }
- }
-
- // IMPLEMENTATION NOTE
- // ===================
- // hash_many() applies two optimizations. The critically important
- // optimization is the high-performance parallel SIMD hashing mode,
- // described in detail in the spec. This more than doubles throughput per
- // thread. Another optimization is keeping the state vectors transposed
- // from block to block within a chunk. When state vectors are transposed
- // after every block, there's a small but measurable performance loss.
- // Compressing chunks with a dedicated loop avoids this.
-
- pub fn hash_many<A: arrayvec::Array<Item = u8>>(
- &self,
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
- ) {
- match self {
- Platform::Portable => portable::hash_many(
- inputs,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- out,
- ),
- // Safe because detect() checked for platform support.
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE2 => unsafe {
- crate::sse2::hash_many(
- inputs,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- out,
- )
- },
- // Safe because detect() checked for platform support.
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::SSE41 => unsafe {
- crate::sse41::hash_many(
- inputs,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- out,
- )
- },
- // Safe because detect() checked for platform support.
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::AVX2 => unsafe {
- crate::avx2::hash_many(
- inputs,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- out,
- )
- },
- // Safe because detect() checked for platform support.
- #[cfg(blake3_avx512_ffi)]
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- Platform::AVX512 => unsafe {
- crate::avx512::hash_many(
- inputs,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- out,
- )
- },
- // Assumed to be safe if the "neon" feature is on.
- #[cfg(feature = "neon")]
- Platform::NEON => unsafe {
- crate::neon::hash_many(
- inputs,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- out,
- )
- },
- }
- }
-
- // Explicit platform constructors, for benchmarks.
-
- pub fn portable() -> Self {
- Self::Portable
- }
-
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- pub fn sse2() -> Option<Self> {
- if sse2_detected() {
- Some(Self::SSE2)
- } else {
- None
- }
- }
-
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- pub fn sse41() -> Option<Self> {
- if sse41_detected() {
- Some(Self::SSE41)
- } else {
- None
- }
- }
-
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- pub fn avx2() -> Option<Self> {
- if avx2_detected() {
- Some(Self::AVX2)
- } else {
- None
- }
- }
-
- #[cfg(blake3_avx512_ffi)]
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- pub fn avx512() -> Option<Self> {
- if avx512_detected() {
- Some(Self::AVX512)
- } else {
- None
- }
- }
-
- #[cfg(feature = "neon")]
- pub fn neon() -> Option<Self> {
- // Assumed to be safe if the "neon" feature is on.
- Some(Self::NEON)
- }
-}
-
-// Note that AVX-512 is divided into multiple featuresets, and we use two of
-// them, F and VL.
-#[cfg(blake3_avx512_ffi)]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[inline(always)]
-pub fn avx512_detected() -> bool {
- // A testing-only short-circuit.
- if cfg!(feature = "no_avx512") {
- return false;
- }
- // Static check, e.g. for building with target-cpu=native.
- #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
- {
- return true;
- }
- // Dynamic check, if std is enabled.
- #[cfg(feature = "std")]
- {
- if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
- return true;
- }
- }
- false
-}
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[inline(always)]
-pub fn avx2_detected() -> bool {
- // A testing-only short-circuit.
- if cfg!(feature = "no_avx2") {
- return false;
- }
- // Static check, e.g. for building with target-cpu=native.
- #[cfg(target_feature = "avx2")]
- {
- return true;
- }
- // Dynamic check, if std is enabled.
- #[cfg(feature = "std")]
- {
- if is_x86_feature_detected!("avx2") {
- return true;
- }
- }
- false
-}
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[inline(always)]
-pub fn sse41_detected() -> bool {
- // A testing-only short-circuit.
- if cfg!(feature = "no_sse41") {
- return false;
- }
- // Static check, e.g. for building with target-cpu=native.
- #[cfg(target_feature = "sse4.1")]
- {
- return true;
- }
- // Dynamic check, if std is enabled.
- #[cfg(feature = "std")]
- {
- if is_x86_feature_detected!("sse4.1") {
- return true;
- }
- }
- false
-}
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[inline(always)]
-#[allow(unreachable_code)]
-pub fn sse2_detected() -> bool {
- // A testing-only short-circuit.
- if cfg!(feature = "no_sse2") {
- return false;
- }
- // Static check, e.g. for building with target-cpu=native.
- #[cfg(target_feature = "sse2")]
- {
- return true;
- }
- // Dynamic check, if std is enabled.
- #[cfg(feature = "std")]
- {
- if is_x86_feature_detected!("sse2") {
- return true;
- }
- }
- false
-}
-
-#[inline(always)]
-pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
- let mut out = [0; 8];
- out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
- out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
- out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
- out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
- out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
- out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
- out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
- out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
- out
-}
-
-#[inline(always)]
-pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
- let mut out = [0; 16];
- out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
- out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
- out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
- out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
- out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
- out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
- out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
- out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
- out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
- out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
- out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
- out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
- out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
- out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
- out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
- out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
- out
-}
-
-#[inline(always)]
-pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
- let mut out = [0; 32];
- *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
- *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
- *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
- *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
- *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
- *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
- *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
- *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
- out
-}
-
-#[inline(always)]
-pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
- let mut out = [0; 64];
- *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
- *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
- *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
- *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
- *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
- *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
- *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
- *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
- *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
- *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
- *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
- *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
- *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
- *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
- *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
- *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
- out
-}
diff --git a/thirdparty/BLAKE3/src/portable.rs b/thirdparty/BLAKE3/src/portable.rs
deleted file mode 100644
index 0a569cec7..000000000
--- a/thirdparty/BLAKE3/src/portable.rs
+++ /dev/null
@@ -1,198 +0,0 @@
-use crate::{
- counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
- OUT_LEN,
-};
-use arrayref::{array_mut_ref, array_ref};
-
-#[inline(always)]
-fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
- state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
- state[d] = (state[d] ^ state[a]).rotate_right(16);
- state[c] = state[c].wrapping_add(state[d]);
- state[b] = (state[b] ^ state[c]).rotate_right(12);
- state[a] = state[a].wrapping_add(state[b]).wrapping_add(y);
- state[d] = (state[d] ^ state[a]).rotate_right(8);
- state[c] = state[c].wrapping_add(state[d]);
- state[b] = (state[b] ^ state[c]).rotate_right(7);
-}
-
-#[inline(always)]
-fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
- // Select the message schedule based on the round.
- let schedule = MSG_SCHEDULE[round];
-
- // Mix the columns.
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
- g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
- g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
- g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
-
- // Mix the diagonals.
- g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
- g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
- g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
- g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
-}
-
-#[inline(always)]
-fn compress_pre(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u32; 16] {
- let block_words = crate::platform::words_from_le_bytes_64(block);
-
- let mut state = [
- cv[0],
- cv[1],
- cv[2],
- cv[3],
- cv[4],
- cv[5],
- cv[6],
- cv[7],
- IV[0],
- IV[1],
- IV[2],
- IV[3],
- counter_low(counter),
- counter_high(counter),
- block_len as u32,
- flags as u32,
- ];
-
- round(&mut state, &block_words, 0);
- round(&mut state, &block_words, 1);
- round(&mut state, &block_words, 2);
- round(&mut state, &block_words, 3);
- round(&mut state, &block_words, 4);
- round(&mut state, &block_words, 5);
- round(&mut state, &block_words, 6);
-
- state
-}
-
-pub fn compress_in_place(
- cv: &mut CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) {
- let state = compress_pre(cv, block, block_len, counter, flags);
-
- cv[0] = state[0] ^ state[8];
- cv[1] = state[1] ^ state[9];
- cv[2] = state[2] ^ state[10];
- cv[3] = state[3] ^ state[11];
- cv[4] = state[4] ^ state[12];
- cv[5] = state[5] ^ state[13];
- cv[6] = state[6] ^ state[14];
- cv[7] = state[7] ^ state[15];
-}
-
-pub fn compress_xof(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64] {
- let mut state = compress_pre(cv, block, block_len, counter, flags);
- state[0] ^= state[8];
- state[1] ^= state[9];
- state[2] ^= state[10];
- state[3] ^= state[11];
- state[4] ^= state[12];
- state[5] ^= state[13];
- state[6] ^= state[14];
- state[7] ^= state[15];
- state[8] ^= cv[0];
- state[9] ^= cv[1];
- state[10] ^= cv[2];
- state[11] ^= cv[3];
- state[12] ^= cv[4];
- state[13] ^= cv[5];
- state[14] ^= cv[6];
- state[15] ^= cv[7];
- crate::platform::le_bytes_from_words_64(&state)
-}
-
-pub fn hash1<A: arrayvec::Array<Item = u8>>(
- input: &A,
- key: &CVWords,
- counter: u64,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut CVBytes,
-) {
- debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
- let mut cv = *key;
- let mut block_flags = flags | flags_start;
- let mut slice = input.as_slice();
- while slice.len() >= BLOCK_LEN {
- if slice.len() == BLOCK_LEN {
- block_flags |= flags_end;
- }
- compress_in_place(
- &mut cv,
- array_ref!(slice, 0, BLOCK_LEN),
- BLOCK_LEN as u8,
- counter,
- block_flags,
- );
- block_flags = flags;
- slice = &slice[BLOCK_LEN..];
- }
- *out = crate::platform::le_bytes_from_words_32(&cv);
-}
-
-pub fn hash_many<A: arrayvec::Array<Item = u8>>(
- inputs: &[&A],
- key: &CVWords,
- mut counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-) {
- debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
- for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
- hash1(
- input,
- key,
- counter,
- flags,
- flags_start,
- flags_end,
- array_mut_ref!(output, 0, OUT_LEN),
- );
- if increment_counter.yes() {
- counter += 1;
- }
- }
-}
-
-#[cfg(test)]
-pub mod test {
- use super::*;
-
- // This is basically testing the portable implementation against itself,
- // but it also checks that compress_in_place and compress_xof are
- // consistent. And there are tests against the reference implementation and
- // against hardcoded test vectors elsewhere.
- #[test]
- fn test_compress() {
- crate::test::test_compress_fn(compress_in_place, compress_xof);
- }
-
- // Ditto.
- #[test]
- fn test_hash_many() {
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/rust_avx2.rs b/thirdparty/BLAKE3/src/rust_avx2.rs
deleted file mode 100644
index 6ab773ad4..000000000
--- a/thirdparty/BLAKE3/src/rust_avx2.rs
+++ /dev/null
@@ -1,474 +0,0 @@
-#[cfg(target_arch = "x86")]
-use core::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use core::arch::x86_64::*;
-
-use crate::{
- counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
-};
-use arrayref::{array_mut_ref, mut_array_refs};
-
-pub const DEGREE: usize = 8;
-
-#[inline(always)]
-unsafe fn loadu(src: *const u8) -> __m256i {
- // This is an unaligned load, so the pointer cast is allowed.
- _mm256_loadu_si256(src as *const __m256i)
-}
-
-#[inline(always)]
-unsafe fn storeu(src: __m256i, dest: *mut u8) {
- // This is an unaligned store, so the pointer cast is allowed.
- _mm256_storeu_si256(dest as *mut __m256i, src)
-}
-
-#[inline(always)]
-unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
- _mm256_add_epi32(a, b)
-}
-
-#[inline(always)]
-unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
- _mm256_xor_si256(a, b)
-}
-
-#[inline(always)]
-unsafe fn set1(x: u32) -> __m256i {
- _mm256_set1_epi32(x as i32)
-}
-
-#[inline(always)]
-unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i {
- _mm256_setr_epi32(
- a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32,
- )
-}
-
-// These rotations are the "simple/shifts version". For the
-// "complicated/shuffles version", see
-// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
-// For a discussion of the tradeoffs, see
-// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
-// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
-// on recent x86 chips.
-
-#[inline(always)]
-unsafe fn rot16(x: __m256i) -> __m256i {
- _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16))
-}
-
-#[inline(always)]
-unsafe fn rot12(x: __m256i) -> __m256i {
- _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12))
-}
-
-#[inline(always)]
-unsafe fn rot8(x: __m256i) -> __m256i {
- _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8))
-}
-
-#[inline(always)]
-unsafe fn rot7(x: __m256i) -> __m256i {
- _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7))
-}
-
-#[inline(always)]
-unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
- v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
- v[0] = add(v[0], v[4]);
- v[1] = add(v[1], v[5]);
- v[2] = add(v[2], v[6]);
- v[3] = add(v[3], v[7]);
- v[12] = xor(v[12], v[0]);
- v[13] = xor(v[13], v[1]);
- v[14] = xor(v[14], v[2]);
- v[15] = xor(v[15], v[3]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[15] = rot16(v[15]);
- v[8] = add(v[8], v[12]);
- v[9] = add(v[9], v[13]);
- v[10] = add(v[10], v[14]);
- v[11] = add(v[11], v[15]);
- v[4] = xor(v[4], v[8]);
- v[5] = xor(v[5], v[9]);
- v[6] = xor(v[6], v[10]);
- v[7] = xor(v[7], v[11]);
- v[4] = rot12(v[4]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
- v[0] = add(v[0], v[4]);
- v[1] = add(v[1], v[5]);
- v[2] = add(v[2], v[6]);
- v[3] = add(v[3], v[7]);
- v[12] = xor(v[12], v[0]);
- v[13] = xor(v[13], v[1]);
- v[14] = xor(v[14], v[2]);
- v[15] = xor(v[15], v[3]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[15] = rot8(v[15]);
- v[8] = add(v[8], v[12]);
- v[9] = add(v[9], v[13]);
- v[10] = add(v[10], v[14]);
- v[11] = add(v[11], v[15]);
- v[4] = xor(v[4], v[8]);
- v[5] = xor(v[5], v[9]);
- v[6] = xor(v[6], v[10]);
- v[7] = xor(v[7], v[11]);
- v[4] = rot7(v[4]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
-
- v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
- v[0] = add(v[0], v[5]);
- v[1] = add(v[1], v[6]);
- v[2] = add(v[2], v[7]);
- v[3] = add(v[3], v[4]);
- v[15] = xor(v[15], v[0]);
- v[12] = xor(v[12], v[1]);
- v[13] = xor(v[13], v[2]);
- v[14] = xor(v[14], v[3]);
- v[15] = rot16(v[15]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[10] = add(v[10], v[15]);
- v[11] = add(v[11], v[12]);
- v[8] = add(v[8], v[13]);
- v[9] = add(v[9], v[14]);
- v[5] = xor(v[5], v[10]);
- v[6] = xor(v[6], v[11]);
- v[7] = xor(v[7], v[8]);
- v[4] = xor(v[4], v[9]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[4] = rot12(v[4]);
- v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
- v[0] = add(v[0], v[5]);
- v[1] = add(v[1], v[6]);
- v[2] = add(v[2], v[7]);
- v[3] = add(v[3], v[4]);
- v[15] = xor(v[15], v[0]);
- v[12] = xor(v[12], v[1]);
- v[13] = xor(v[13], v[2]);
- v[14] = xor(v[14], v[3]);
- v[15] = rot8(v[15]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[10] = add(v[10], v[15]);
- v[11] = add(v[11], v[12]);
- v[8] = add(v[8], v[13]);
- v[9] = add(v[9], v[14]);
- v[5] = xor(v[5], v[10]);
- v[6] = xor(v[6], v[11]);
- v[7] = xor(v[7], v[8]);
- v[4] = xor(v[4], v[9]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
- v[4] = rot7(v[4]);
-}
-
-#[inline(always)]
-unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
- (
- _mm256_permute2x128_si256(a, b, 0x20),
- _mm256_permute2x128_si256(a, b, 0x31),
- )
-}
-
-// There are several ways to do a transposition. We could do it naively, with 8 separate
-// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy
-// the vecs into contiguous storage and then use gather instructions. This third approach is to use
-// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the
-// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the
-// https://github.com/oconnor663/bao_experiments repo.
-#[inline(always)]
-unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) {
- // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77.
- let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
- let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
- let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
- let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
- let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
- let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
- let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
- let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
-
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33.
- let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
- let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
- let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
- let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
- let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
- let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
- let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
- let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
-
- // Interleave 128-bit lanes.
- let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04);
- let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15);
- let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26);
- let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37);
-
- vecs[0] = abcdefgh_0;
- vecs[1] = abcdefgh_1;
- vecs[2] = abcdefgh_2;
- vecs[3] = abcdefgh_3;
- vecs[4] = abcdefgh_4;
- vecs[5] = abcdefgh_5;
- vecs[6] = abcdefgh_6;
- vecs[7] = abcdefgh_7;
-}
-
-#[inline(always)]
-unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] {
- let mut vecs = [
- loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)),
- ];
- for i in 0..DEGREE {
- _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
- }
- let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE);
- transpose_vecs(squares.0);
- transpose_vecs(squares.1);
- vecs
-}
-
-#[inline(always)]
-unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) {
- let mask = if increment_counter.yes() { !0 } else { 0 };
- (
- set8(
- counter_low(counter + (mask & 0)),
- counter_low(counter + (mask & 1)),
- counter_low(counter + (mask & 2)),
- counter_low(counter + (mask & 3)),
- counter_low(counter + (mask & 4)),
- counter_low(counter + (mask & 5)),
- counter_low(counter + (mask & 6)),
- counter_low(counter + (mask & 7)),
- ),
- set8(
- counter_high(counter + (mask & 0)),
- counter_high(counter + (mask & 1)),
- counter_high(counter + (mask & 2)),
- counter_high(counter + (mask & 3)),
- counter_high(counter + (mask & 4)),
- counter_high(counter + (mask & 5)),
- counter_high(counter + (mask & 6)),
- counter_high(counter + (mask & 7)),
- ),
- )
-}
-
-#[target_feature(enable = "avx2")]
-pub unsafe fn hash8(
- inputs: &[*const u8; DEGREE],
- blocks: usize,
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8; DEGREE * OUT_LEN],
-) {
- let mut h_vecs = [
- set1(key[0]),
- set1(key[1]),
- set1(key[2]),
- set1(key[3]),
- set1(key[4]),
- set1(key[5]),
- set1(key[6]),
- set1(key[7]),
- ];
- let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
- let mut block_flags = flags | flags_start;
-
- for block in 0..blocks {
- if block + 1 == blocks {
- block_flags |= flags_end;
- }
- let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
- let block_flags_vec = set1(block_flags as u32);
- let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
-
- // The transposed compression function. Note that inlining this
- // manually here improves compile times by a lot, compared to factoring
- // it out into its own function and making it #[inline(always)]. Just
- // guessing, it might have something to do with loop unrolling.
- let mut v = [
- h_vecs[0],
- h_vecs[1],
- h_vecs[2],
- h_vecs[3],
- h_vecs[4],
- h_vecs[5],
- h_vecs[6],
- h_vecs[7],
- set1(IV[0]),
- set1(IV[1]),
- set1(IV[2]),
- set1(IV[3]),
- counter_low_vec,
- counter_high_vec,
- block_len_vec,
- block_flags_vec,
- ];
- round(&mut v, &msg_vecs, 0);
- round(&mut v, &msg_vecs, 1);
- round(&mut v, &msg_vecs, 2);
- round(&mut v, &msg_vecs, 3);
- round(&mut v, &msg_vecs, 4);
- round(&mut v, &msg_vecs, 5);
- round(&mut v, &msg_vecs, 6);
- h_vecs[0] = xor(v[0], v[8]);
- h_vecs[1] = xor(v[1], v[9]);
- h_vecs[2] = xor(v[2], v[10]);
- h_vecs[3] = xor(v[3], v[11]);
- h_vecs[4] = xor(v[4], v[12]);
- h_vecs[5] = xor(v[5], v[13]);
- h_vecs[6] = xor(v[6], v[14]);
- h_vecs[7] = xor(v[7], v[15]);
-
- block_flags = flags;
- }
-
- transpose_vecs(&mut h_vecs);
- storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
- storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE));
- storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE));
- storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE));
- storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE));
- storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE));
- storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE));
- storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
-}
-
-#[target_feature(enable = "avx2")]
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- mut inputs: &[&A],
- key: &CVWords,
- mut counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- mut out: &mut [u8],
-) {
- debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
- while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
- // Safe because the layout of arrays is guaranteed, and because the
- // `blocks` count is determined statically from the argument type.
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
- let blocks = A::CAPACITY / BLOCK_LEN;
- hash8(
- input_ptrs,
- blocks,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- array_mut_ref!(out, 0, DEGREE * OUT_LEN),
- );
- if increment_counter.yes() {
- counter += DEGREE as u64;
- }
- inputs = &inputs[DEGREE..];
- out = &mut out[DEGREE * OUT_LEN..];
- }
- crate::sse41::hash_many(
- inputs,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- out,
- );
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_transpose() {
- if !crate::platform::avx2_detected() {
- return;
- }
-
- #[target_feature(enable = "avx2")]
- unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) {
- transpose_vecs(vecs);
- }
-
- let mut matrix = [[0 as u32; DEGREE]; DEGREE];
- for i in 0..DEGREE {
- for j in 0..DEGREE {
- matrix[i][j] = (i * DEGREE + j) as u32;
- }
- }
-
- unsafe {
- let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix);
- transpose_wrapper(&mut vecs);
- matrix = core::mem::transmute(vecs);
- }
-
- for i in 0..DEGREE {
- for j in 0..DEGREE {
- // Reversed indexes from above.
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
- }
- }
- }
-
- #[test]
- fn test_hash_many() {
- if !crate::platform::avx2_detected() {
- return;
- }
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/rust_sse2.rs b/thirdparty/BLAKE3/src/rust_sse2.rs
deleted file mode 100644
index 15b52ee5d..000000000
--- a/thirdparty/BLAKE3/src/rust_sse2.rs
+++ /dev/null
@@ -1,775 +0,0 @@
-#[cfg(target_arch = "x86")]
-use core::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use core::arch::x86_64::*;
-
-use crate::{
- counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
- OUT_LEN,
-};
-use arrayref::{array_mut_ref, array_ref, mut_array_refs};
-
-pub const DEGREE: usize = 4;
-
-#[inline(always)]
-unsafe fn loadu(src: *const u8) -> __m128i {
- // This is an unaligned load, so the pointer cast is allowed.
- _mm_loadu_si128(src as *const __m128i)
-}
-
-#[inline(always)]
-unsafe fn storeu(src: __m128i, dest: *mut u8) {
- // This is an unaligned store, so the pointer cast is allowed.
- _mm_storeu_si128(dest as *mut __m128i, src)
-}
-
-#[inline(always)]
-unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
- _mm_add_epi32(a, b)
-}
-
-#[inline(always)]
-unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
- _mm_xor_si128(a, b)
-}
-
-#[inline(always)]
-unsafe fn set1(x: u32) -> __m128i {
- _mm_set1_epi32(x as i32)
-}
-
-#[inline(always)]
-unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
- _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
-}
-
-// These rotations are the "simple/shifts version". For the
-// "complicated/shuffles version", see
-// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
-// For a discussion of the tradeoffs, see
-// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
-// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
-// on recent x86 chips.
-
-#[inline(always)]
-unsafe fn rot16(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
-}
-
-#[inline(always)]
-unsafe fn rot12(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
-}
-
-#[inline(always)]
-unsafe fn rot8(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
-}
-
-#[inline(always)]
-unsafe fn rot7(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
-}
-
-#[inline(always)]
-unsafe fn g1(
- row0: &mut __m128i,
- row1: &mut __m128i,
- row2: &mut __m128i,
- row3: &mut __m128i,
- m: __m128i,
-) {
- *row0 = add(add(*row0, m), *row1);
- *row3 = xor(*row3, *row0);
- *row3 = rot16(*row3);
- *row2 = add(*row2, *row3);
- *row1 = xor(*row1, *row2);
- *row1 = rot12(*row1);
-}
-
-#[inline(always)]
-unsafe fn g2(
- row0: &mut __m128i,
- row1: &mut __m128i,
- row2: &mut __m128i,
- row3: &mut __m128i,
- m: __m128i,
-) {
- *row0 = add(add(*row0, m), *row1);
- *row3 = xor(*row3, *row0);
- *row3 = rot8(*row3);
- *row2 = add(*row2, *row3);
- *row1 = xor(*row1, *row2);
- *row1 = rot7(*row1);
-}
-
-// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
-macro_rules! _MM_SHUFFLE {
- ($z:expr, $y:expr, $x:expr, $w:expr) => {
- ($z << 6) | ($y << 4) | ($x << 2) | $w
- };
-}
-
-macro_rules! shuffle2 {
- ($a:expr, $b:expr, $c:expr) => {
- _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps($a),
- _mm_castsi128_ps($b),
- $c,
- ))
- };
-}
-
-// Note the optimization here of leaving row1 as the unrotated row, rather than
-// row0. All the message loads below are adjusted to compensate for this. See
-// discussion at https://github.com/sneves/blake2-avx2/pull/4
-#[inline(always)]
-unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
-}
-
-#[inline(always)]
-unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
-}
-
-#[inline(always)]
-unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
- let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
- let mut mask = _mm_set1_epi16(imm8 as i16);
- mask = _mm_and_si128(mask, bits);
- mask = _mm_cmpeq_epi16(mask, bits);
- _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
-}
-
-#[inline(always)]
-unsafe fn compress_pre(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [__m128i; 4] {
- let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
- let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
- let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
- let row3 = &mut set4(
- counter_low(counter),
- counter_high(counter),
- block_len as u32,
- flags as u32,
- );
-
- let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
- let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
- let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
- let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));
-
- let mut t0;
- let mut t1;
- let mut t2;
- let mut t3;
- let mut tt;
-
- // Round 1. The first round permutes the message words from the original
- // input order, into the groups that get mixed in parallel.
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
- t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
- g1(row0, row1, row2, row3, t2);
- t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
- t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 2. This round and all following rounds apply a fixed permutation
- // to the message words from the round before.
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 3
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 4
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 5
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 6
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 7
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
-
- [*row0, *row1, *row2, *row3]
-}
-
-#[target_feature(enable = "sse2")]
-pub unsafe fn compress_in_place(
- cv: &mut CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) {
- let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
- storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
- storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
-}
-
-#[target_feature(enable = "sse2")]
-pub unsafe fn compress_xof(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64] {
- let [mut row0, mut row1, mut row2, mut row3] =
- compress_pre(cv, block, block_len, counter, flags);
- row0 = xor(row0, row2);
- row1 = xor(row1, row3);
- row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
- row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
- core::mem::transmute([row0, row1, row2, row3])
-}
-
-#[inline(always)]
-unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
- v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
- v[0] = add(v[0], v[4]);
- v[1] = add(v[1], v[5]);
- v[2] = add(v[2], v[6]);
- v[3] = add(v[3], v[7]);
- v[12] = xor(v[12], v[0]);
- v[13] = xor(v[13], v[1]);
- v[14] = xor(v[14], v[2]);
- v[15] = xor(v[15], v[3]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[15] = rot16(v[15]);
- v[8] = add(v[8], v[12]);
- v[9] = add(v[9], v[13]);
- v[10] = add(v[10], v[14]);
- v[11] = add(v[11], v[15]);
- v[4] = xor(v[4], v[8]);
- v[5] = xor(v[5], v[9]);
- v[6] = xor(v[6], v[10]);
- v[7] = xor(v[7], v[11]);
- v[4] = rot12(v[4]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
- v[0] = add(v[0], v[4]);
- v[1] = add(v[1], v[5]);
- v[2] = add(v[2], v[6]);
- v[3] = add(v[3], v[7]);
- v[12] = xor(v[12], v[0]);
- v[13] = xor(v[13], v[1]);
- v[14] = xor(v[14], v[2]);
- v[15] = xor(v[15], v[3]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[15] = rot8(v[15]);
- v[8] = add(v[8], v[12]);
- v[9] = add(v[9], v[13]);
- v[10] = add(v[10], v[14]);
- v[11] = add(v[11], v[15]);
- v[4] = xor(v[4], v[8]);
- v[5] = xor(v[5], v[9]);
- v[6] = xor(v[6], v[10]);
- v[7] = xor(v[7], v[11]);
- v[4] = rot7(v[4]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
-
- v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
- v[0] = add(v[0], v[5]);
- v[1] = add(v[1], v[6]);
- v[2] = add(v[2], v[7]);
- v[3] = add(v[3], v[4]);
- v[15] = xor(v[15], v[0]);
- v[12] = xor(v[12], v[1]);
- v[13] = xor(v[13], v[2]);
- v[14] = xor(v[14], v[3]);
- v[15] = rot16(v[15]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[10] = add(v[10], v[15]);
- v[11] = add(v[11], v[12]);
- v[8] = add(v[8], v[13]);
- v[9] = add(v[9], v[14]);
- v[5] = xor(v[5], v[10]);
- v[6] = xor(v[6], v[11]);
- v[7] = xor(v[7], v[8]);
- v[4] = xor(v[4], v[9]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[4] = rot12(v[4]);
- v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
- v[0] = add(v[0], v[5]);
- v[1] = add(v[1], v[6]);
- v[2] = add(v[2], v[7]);
- v[3] = add(v[3], v[4]);
- v[15] = xor(v[15], v[0]);
- v[12] = xor(v[12], v[1]);
- v[13] = xor(v[13], v[2]);
- v[14] = xor(v[14], v[3]);
- v[15] = rot8(v[15]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[10] = add(v[10], v[15]);
- v[11] = add(v[11], v[12]);
- v[8] = add(v[8], v[13]);
- v[9] = add(v[9], v[14]);
- v[5] = xor(v[5], v[10]);
- v[6] = xor(v[6], v[11]);
- v[7] = xor(v[7], v[8]);
- v[4] = xor(v[4], v[9]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
- v[4] = rot7(v[4]);
-}
-
-#[inline(always)]
-unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
- // 22/33. Note that this doesn't split the vector into two lanes, as the
- // AVX2 counterparts do.
- let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
- let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
- let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
- let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-
- // Interleave 64-bit lanes.
- let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
- let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
- let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
- let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
-
- vecs[0] = abcd_0;
- vecs[1] = abcd_1;
- vecs[2] = abcd_2;
- vecs[3] = abcd_3;
-}
-
-#[inline(always)]
-unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
- let mut vecs = [
- loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
- ];
- for i in 0..DEGREE {
- _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
- }
- let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
- transpose_vecs(squares.0);
- transpose_vecs(squares.1);
- transpose_vecs(squares.2);
- transpose_vecs(squares.3);
- vecs
-}
-
-#[inline(always)]
-unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
- let mask = if increment_counter.yes() { !0 } else { 0 };
- (
- set4(
- counter_low(counter + (mask & 0)),
- counter_low(counter + (mask & 1)),
- counter_low(counter + (mask & 2)),
- counter_low(counter + (mask & 3)),
- ),
- set4(
- counter_high(counter + (mask & 0)),
- counter_high(counter + (mask & 1)),
- counter_high(counter + (mask & 2)),
- counter_high(counter + (mask & 3)),
- ),
- )
-}
-
-#[target_feature(enable = "sse2")]
-pub unsafe fn hash4(
- inputs: &[*const u8; DEGREE],
- blocks: usize,
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8; DEGREE * OUT_LEN],
-) {
- let mut h_vecs = [
- set1(key[0]),
- set1(key[1]),
- set1(key[2]),
- set1(key[3]),
- set1(key[4]),
- set1(key[5]),
- set1(key[6]),
- set1(key[7]),
- ];
- let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
- let mut block_flags = flags | flags_start;
-
- for block in 0..blocks {
- if block + 1 == blocks {
- block_flags |= flags_end;
- }
- let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
- let block_flags_vec = set1(block_flags as u32);
- let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
-
- // The transposed compression function. Note that inlining this
- // manually here improves compile times by a lot, compared to factoring
- // it out into its own function and making it #[inline(always)]. Just
- // guessing, it might have something to do with loop unrolling.
- let mut v = [
- h_vecs[0],
- h_vecs[1],
- h_vecs[2],
- h_vecs[3],
- h_vecs[4],
- h_vecs[5],
- h_vecs[6],
- h_vecs[7],
- set1(IV[0]),
- set1(IV[1]),
- set1(IV[2]),
- set1(IV[3]),
- counter_low_vec,
- counter_high_vec,
- block_len_vec,
- block_flags_vec,
- ];
- round(&mut v, &msg_vecs, 0);
- round(&mut v, &msg_vecs, 1);
- round(&mut v, &msg_vecs, 2);
- round(&mut v, &msg_vecs, 3);
- round(&mut v, &msg_vecs, 4);
- round(&mut v, &msg_vecs, 5);
- round(&mut v, &msg_vecs, 6);
- h_vecs[0] = xor(v[0], v[8]);
- h_vecs[1] = xor(v[1], v[9]);
- h_vecs[2] = xor(v[2], v[10]);
- h_vecs[3] = xor(v[3], v[11]);
- h_vecs[4] = xor(v[4], v[12]);
- h_vecs[5] = xor(v[5], v[13]);
- h_vecs[6] = xor(v[6], v[14]);
- h_vecs[7] = xor(v[7], v[15]);
-
- block_flags = flags;
- }
-
- let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
- transpose_vecs(squares.0);
- transpose_vecs(squares.1);
- // The first four vecs now contain the first half of each output, and the
- // second four vecs contain the second half of each output.
- storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
- storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
- storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
- storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
- storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
- storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
- storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
- storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
-}
-
-#[target_feature(enable = "sse2")]
-unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
- input: &A,
- key: &CVWords,
- counter: u64,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut CVBytes,
-) {
- debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
- let mut cv = *key;
- let mut block_flags = flags | flags_start;
- let mut slice = input.as_slice();
- while slice.len() >= BLOCK_LEN {
- if slice.len() == BLOCK_LEN {
- block_flags |= flags_end;
- }
- compress_in_place(
- &mut cv,
- array_ref!(slice, 0, BLOCK_LEN),
- BLOCK_LEN as u8,
- counter,
- block_flags,
- );
- block_flags = flags;
- slice = &slice[BLOCK_LEN..];
- }
- *out = core::mem::transmute(cv); // x86 is little-endian
-}
-
-#[target_feature(enable = "sse2")]
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- mut inputs: &[&A],
- key: &CVWords,
- mut counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- mut out: &mut [u8],
-) {
- debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
- while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
- // Safe because the layout of arrays is guaranteed, and because the
- // `blocks` count is determined statically from the argument type.
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
- let blocks = A::CAPACITY / BLOCK_LEN;
- hash4(
- input_ptrs,
- blocks,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- array_mut_ref!(out, 0, DEGREE * OUT_LEN),
- );
- if increment_counter.yes() {
- counter += DEGREE as u64;
- }
- inputs = &inputs[DEGREE..];
- out = &mut out[DEGREE * OUT_LEN..];
- }
- for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
- hash1(
- input,
- key,
- counter,
- flags,
- flags_start,
- flags_end,
- array_mut_ref!(output, 0, OUT_LEN),
- );
- if increment_counter.yes() {
- counter += 1;
- }
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_transpose() {
- if !crate::platform::sse2_detected() {
- return;
- }
-
- #[target_feature(enable = "sse2")]
- unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
- transpose_vecs(vecs);
- }
-
- let mut matrix = [[0 as u32; DEGREE]; DEGREE];
- for i in 0..DEGREE {
- for j in 0..DEGREE {
- matrix[i][j] = (i * DEGREE + j) as u32;
- }
- }
-
- unsafe {
- let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
- transpose_wrapper(&mut vecs);
- matrix = core::mem::transmute(vecs);
- }
-
- for i in 0..DEGREE {
- for j in 0..DEGREE {
- // Reversed indexes from above.
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
- }
- }
- }
-
- #[test]
- fn test_compress() {
- if !crate::platform::sse2_detected() {
- return;
- }
- crate::test::test_compress_fn(compress_in_place, compress_xof);
- }
-
- #[test]
- fn test_hash_many() {
- if !crate::platform::sse2_detected() {
- return;
- }
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/rust_sse41.rs b/thirdparty/BLAKE3/src/rust_sse41.rs
deleted file mode 100644
index d5cf0f4a9..000000000
--- a/thirdparty/BLAKE3/src/rust_sse41.rs
+++ /dev/null
@@ -1,766 +0,0 @@
-#[cfg(target_arch = "x86")]
-use core::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use core::arch::x86_64::*;
-
-use crate::{
- counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
- OUT_LEN,
-};
-use arrayref::{array_mut_ref, array_ref, mut_array_refs};
-
-pub const DEGREE: usize = 4;
-
-#[inline(always)]
-unsafe fn loadu(src: *const u8) -> __m128i {
- // This is an unaligned load, so the pointer cast is allowed.
- _mm_loadu_si128(src as *const __m128i)
-}
-
-#[inline(always)]
-unsafe fn storeu(src: __m128i, dest: *mut u8) {
- // This is an unaligned store, so the pointer cast is allowed.
- _mm_storeu_si128(dest as *mut __m128i, src)
-}
-
-#[inline(always)]
-unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
- _mm_add_epi32(a, b)
-}
-
-#[inline(always)]
-unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
- _mm_xor_si128(a, b)
-}
-
-#[inline(always)]
-unsafe fn set1(x: u32) -> __m128i {
- _mm_set1_epi32(x as i32)
-}
-
-#[inline(always)]
-unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
- _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
-}
-
-// These rotations are the "simple/shifts version". For the
-// "complicated/shuffles version", see
-// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
-// For a discussion of the tradeoffs, see
-// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
-// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
-// on recent x86 chips.
-
-#[inline(always)]
-unsafe fn rot16(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
-}
-
-#[inline(always)]
-unsafe fn rot12(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
-}
-
-#[inline(always)]
-unsafe fn rot8(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
-}
-
-#[inline(always)]
-unsafe fn rot7(a: __m128i) -> __m128i {
- _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
-}
-
-#[inline(always)]
-unsafe fn g1(
- row0: &mut __m128i,
- row1: &mut __m128i,
- row2: &mut __m128i,
- row3: &mut __m128i,
- m: __m128i,
-) {
- *row0 = add(add(*row0, m), *row1);
- *row3 = xor(*row3, *row0);
- *row3 = rot16(*row3);
- *row2 = add(*row2, *row3);
- *row1 = xor(*row1, *row2);
- *row1 = rot12(*row1);
-}
-
-#[inline(always)]
-unsafe fn g2(
- row0: &mut __m128i,
- row1: &mut __m128i,
- row2: &mut __m128i,
- row3: &mut __m128i,
- m: __m128i,
-) {
- *row0 = add(add(*row0, m), *row1);
- *row3 = xor(*row3, *row0);
- *row3 = rot8(*row3);
- *row2 = add(*row2, *row3);
- *row1 = xor(*row1, *row2);
- *row1 = rot7(*row1);
-}
-
-// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
-macro_rules! _MM_SHUFFLE {
- ($z:expr, $y:expr, $x:expr, $w:expr) => {
- ($z << 6) | ($y << 4) | ($x << 2) | $w
- };
-}
-
-macro_rules! shuffle2 {
- ($a:expr, $b:expr, $c:expr) => {
- _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps($a),
- _mm_castsi128_ps($b),
- $c,
- ))
- };
-}
-
-// Note the optimization here of leaving row1 as the unrotated row, rather than
-// row0. All the message loads below are adjusted to compensate for this. See
-// discussion at https://github.com/sneves/blake2-avx2/pull/4
-#[inline(always)]
-unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
-}
-
-#[inline(always)]
-unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
- *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
- *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
-}
-
-#[inline(always)]
-unsafe fn compress_pre(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [__m128i; 4] {
- let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
- let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
- let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
- let row3 = &mut set4(
- counter_low(counter),
- counter_high(counter),
- block_len as u32,
- flags as u32,
- );
-
- let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
- let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
- let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
- let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));
-
- let mut t0;
- let mut t1;
- let mut t2;
- let mut t3;
- let mut tt;
-
- // Round 1. The first round permutes the message words from the original
- // input order, into the groups that get mixed in parallel.
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
- t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
- g1(row0, row1, row2, row3, t2);
- t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
- t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 2. This round and all following rounds apply a fixed permutation
- // to the message words from the round before.
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 3
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 4
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 5
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 6
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
- m0 = t0;
- m1 = t1;
- m2 = t2;
- m3 = t3;
-
- // Round 7
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
- t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
- g1(row0, row1, row2, row3, t0);
- t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
- tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
- t1 = _mm_blend_epi16(tt, t1, 0xCC);
- g2(row0, row1, row2, row3, t1);
- diagonalize(row0, row2, row3);
- t2 = _mm_unpacklo_epi64(m3, m1);
- tt = _mm_blend_epi16(t2, m2, 0xC0);
- t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
- g1(row0, row1, row2, row3, t2);
- t3 = _mm_unpackhi_epi32(m1, m3);
- tt = _mm_unpacklo_epi32(m2, t3);
- t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
- g2(row0, row1, row2, row3, t3);
- undiagonalize(row0, row2, row3);
-
- [*row0, *row1, *row2, *row3]
-}
-
-#[target_feature(enable = "sse4.1")]
-pub unsafe fn compress_in_place(
- cv: &mut CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) {
- let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
- storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
- storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
-}
-
-#[target_feature(enable = "sse4.1")]
-pub unsafe fn compress_xof(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64] {
- let [mut row0, mut row1, mut row2, mut row3] =
- compress_pre(cv, block, block_len, counter, flags);
- row0 = xor(row0, row2);
- row1 = xor(row1, row3);
- row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
- row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
- core::mem::transmute([row0, row1, row2, row3])
-}
-
-#[inline(always)]
-unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
- v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
- v[0] = add(v[0], v[4]);
- v[1] = add(v[1], v[5]);
- v[2] = add(v[2], v[6]);
- v[3] = add(v[3], v[7]);
- v[12] = xor(v[12], v[0]);
- v[13] = xor(v[13], v[1]);
- v[14] = xor(v[14], v[2]);
- v[15] = xor(v[15], v[3]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[15] = rot16(v[15]);
- v[8] = add(v[8], v[12]);
- v[9] = add(v[9], v[13]);
- v[10] = add(v[10], v[14]);
- v[11] = add(v[11], v[15]);
- v[4] = xor(v[4], v[8]);
- v[5] = xor(v[5], v[9]);
- v[6] = xor(v[6], v[10]);
- v[7] = xor(v[7], v[11]);
- v[4] = rot12(v[4]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
- v[0] = add(v[0], v[4]);
- v[1] = add(v[1], v[5]);
- v[2] = add(v[2], v[6]);
- v[3] = add(v[3], v[7]);
- v[12] = xor(v[12], v[0]);
- v[13] = xor(v[13], v[1]);
- v[14] = xor(v[14], v[2]);
- v[15] = xor(v[15], v[3]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[15] = rot8(v[15]);
- v[8] = add(v[8], v[12]);
- v[9] = add(v[9], v[13]);
- v[10] = add(v[10], v[14]);
- v[11] = add(v[11], v[15]);
- v[4] = xor(v[4], v[8]);
- v[5] = xor(v[5], v[9]);
- v[6] = xor(v[6], v[10]);
- v[7] = xor(v[7], v[11]);
- v[4] = rot7(v[4]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
-
- v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
- v[0] = add(v[0], v[5]);
- v[1] = add(v[1], v[6]);
- v[2] = add(v[2], v[7]);
- v[3] = add(v[3], v[4]);
- v[15] = xor(v[15], v[0]);
- v[12] = xor(v[12], v[1]);
- v[13] = xor(v[13], v[2]);
- v[14] = xor(v[14], v[3]);
- v[15] = rot16(v[15]);
- v[12] = rot16(v[12]);
- v[13] = rot16(v[13]);
- v[14] = rot16(v[14]);
- v[10] = add(v[10], v[15]);
- v[11] = add(v[11], v[12]);
- v[8] = add(v[8], v[13]);
- v[9] = add(v[9], v[14]);
- v[5] = xor(v[5], v[10]);
- v[6] = xor(v[6], v[11]);
- v[7] = xor(v[7], v[8]);
- v[4] = xor(v[4], v[9]);
- v[5] = rot12(v[5]);
- v[6] = rot12(v[6]);
- v[7] = rot12(v[7]);
- v[4] = rot12(v[4]);
- v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
- v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
- v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
- v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
- v[0] = add(v[0], v[5]);
- v[1] = add(v[1], v[6]);
- v[2] = add(v[2], v[7]);
- v[3] = add(v[3], v[4]);
- v[15] = xor(v[15], v[0]);
- v[12] = xor(v[12], v[1]);
- v[13] = xor(v[13], v[2]);
- v[14] = xor(v[14], v[3]);
- v[15] = rot8(v[15]);
- v[12] = rot8(v[12]);
- v[13] = rot8(v[13]);
- v[14] = rot8(v[14]);
- v[10] = add(v[10], v[15]);
- v[11] = add(v[11], v[12]);
- v[8] = add(v[8], v[13]);
- v[9] = add(v[9], v[14]);
- v[5] = xor(v[5], v[10]);
- v[6] = xor(v[6], v[11]);
- v[7] = xor(v[7], v[8]);
- v[4] = xor(v[4], v[9]);
- v[5] = rot7(v[5]);
- v[6] = rot7(v[6]);
- v[7] = rot7(v[7]);
- v[4] = rot7(v[4]);
-}
-
-#[inline(always)]
-unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
- // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
- // 22/33. Note that this doesn't split the vector into two lanes, as the
- // AVX2 counterparts do.
- let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
- let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
- let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
- let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-
- // Interleave 64-bit lanes.
- let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
- let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
- let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
- let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
-
- vecs[0] = abcd_0;
- vecs[1] = abcd_1;
- vecs[2] = abcd_2;
- vecs[3] = abcd_3;
-}
-
-#[inline(always)]
-unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
- let mut vecs = [
- loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
- loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
- loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
- loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
- loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
- loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
- loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
- ];
- for i in 0..DEGREE {
- _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
- }
- let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
- transpose_vecs(squares.0);
- transpose_vecs(squares.1);
- transpose_vecs(squares.2);
- transpose_vecs(squares.3);
- vecs
-}
-
-#[inline(always)]
-unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
- let mask = if increment_counter.yes() { !0 } else { 0 };
- (
- set4(
- counter_low(counter + (mask & 0)),
- counter_low(counter + (mask & 1)),
- counter_low(counter + (mask & 2)),
- counter_low(counter + (mask & 3)),
- ),
- set4(
- counter_high(counter + (mask & 0)),
- counter_high(counter + (mask & 1)),
- counter_high(counter + (mask & 2)),
- counter_high(counter + (mask & 3)),
- ),
- )
-}
-
-#[target_feature(enable = "sse4.1")]
-pub unsafe fn hash4(
- inputs: &[*const u8; DEGREE],
- blocks: usize,
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8; DEGREE * OUT_LEN],
-) {
- let mut h_vecs = [
- set1(key[0]),
- set1(key[1]),
- set1(key[2]),
- set1(key[3]),
- set1(key[4]),
- set1(key[5]),
- set1(key[6]),
- set1(key[7]),
- ];
- let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
- let mut block_flags = flags | flags_start;
-
- for block in 0..blocks {
- if block + 1 == blocks {
- block_flags |= flags_end;
- }
- let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
- let block_flags_vec = set1(block_flags as u32);
- let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
-
- // The transposed compression function. Note that inlining this
- // manually here improves compile times by a lot, compared to factoring
- // it out into its own function and making it #[inline(always)]. Just
- // guessing, it might have something to do with loop unrolling.
- let mut v = [
- h_vecs[0],
- h_vecs[1],
- h_vecs[2],
- h_vecs[3],
- h_vecs[4],
- h_vecs[5],
- h_vecs[6],
- h_vecs[7],
- set1(IV[0]),
- set1(IV[1]),
- set1(IV[2]),
- set1(IV[3]),
- counter_low_vec,
- counter_high_vec,
- block_len_vec,
- block_flags_vec,
- ];
- round(&mut v, &msg_vecs, 0);
- round(&mut v, &msg_vecs, 1);
- round(&mut v, &msg_vecs, 2);
- round(&mut v, &msg_vecs, 3);
- round(&mut v, &msg_vecs, 4);
- round(&mut v, &msg_vecs, 5);
- round(&mut v, &msg_vecs, 6);
- h_vecs[0] = xor(v[0], v[8]);
- h_vecs[1] = xor(v[1], v[9]);
- h_vecs[2] = xor(v[2], v[10]);
- h_vecs[3] = xor(v[3], v[11]);
- h_vecs[4] = xor(v[4], v[12]);
- h_vecs[5] = xor(v[5], v[13]);
- h_vecs[6] = xor(v[6], v[14]);
- h_vecs[7] = xor(v[7], v[15]);
-
- block_flags = flags;
- }
-
- let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
- transpose_vecs(squares.0);
- transpose_vecs(squares.1);
- // The first four vecs now contain the first half of each output, and the
- // second four vecs contain the second half of each output.
- storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
- storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
- storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
- storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
- storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
- storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
- storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
- storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
-}
-
-#[target_feature(enable = "sse4.1")]
-unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
- input: &A,
- key: &CVWords,
- counter: u64,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut CVBytes,
-) {
- debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
- let mut cv = *key;
- let mut block_flags = flags | flags_start;
- let mut slice = input.as_slice();
- while slice.len() >= BLOCK_LEN {
- if slice.len() == BLOCK_LEN {
- block_flags |= flags_end;
- }
- compress_in_place(
- &mut cv,
- array_ref!(slice, 0, BLOCK_LEN),
- BLOCK_LEN as u8,
- counter,
- block_flags,
- );
- block_flags = flags;
- slice = &slice[BLOCK_LEN..];
- }
- *out = core::mem::transmute(cv); // x86 is little-endian
-}
-
-#[target_feature(enable = "sse4.1")]
-pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
- mut inputs: &[&A],
- key: &CVWords,
- mut counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- mut out: &mut [u8],
-) {
- debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
- while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
- // Safe because the layout of arrays is guaranteed, and because the
- // `blocks` count is determined statically from the argument type.
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
- let blocks = A::CAPACITY / BLOCK_LEN;
- hash4(
- input_ptrs,
- blocks,
- key,
- counter,
- increment_counter,
- flags,
- flags_start,
- flags_end,
- array_mut_ref!(out, 0, DEGREE * OUT_LEN),
- );
- if increment_counter.yes() {
- counter += DEGREE as u64;
- }
- inputs = &inputs[DEGREE..];
- out = &mut out[DEGREE * OUT_LEN..];
- }
- for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
- hash1(
- input,
- key,
- counter,
- flags,
- flags_start,
- flags_end,
- array_mut_ref!(output, 0, OUT_LEN),
- );
- if increment_counter.yes() {
- counter += 1;
- }
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_transpose() {
- if !crate::platform::sse41_detected() {
- return;
- }
-
- #[target_feature(enable = "sse4.1")]
- unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
- transpose_vecs(vecs);
- }
-
- let mut matrix = [[0 as u32; DEGREE]; DEGREE];
- for i in 0..DEGREE {
- for j in 0..DEGREE {
- matrix[i][j] = (i * DEGREE + j) as u32;
- }
- }
-
- unsafe {
- let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
- transpose_wrapper(&mut vecs);
- matrix = core::mem::transmute(vecs);
- }
-
- for i in 0..DEGREE {
- for j in 0..DEGREE {
- // Reversed indexes from above.
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
- }
- }
- }
-
- #[test]
- fn test_compress() {
- if !crate::platform::sse41_detected() {
- return;
- }
- crate::test::test_compress_fn(compress_in_place, compress_xof);
- }
-
- #[test]
- fn test_hash_many() {
- if !crate::platform::sse41_detected() {
- return;
- }
- crate::test::test_hash_many_fn(hash_many, hash_many);
- }
-}
diff --git a/thirdparty/BLAKE3/src/test.rs b/thirdparty/BLAKE3/src/test.rs
deleted file mode 100644
index eefb1a354..000000000
--- a/thirdparty/BLAKE3/src/test.rs
+++ /dev/null
@@ -1,569 +0,0 @@
-use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
-use core::sync::atomic::{AtomicUsize, Ordering};
-use core::usize;
-use rand::prelude::*;
-
-// Interesting input lengths to run tests on.
-pub const TEST_CASES: &[usize] = &[
- 0,
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7,
- 8,
- BLOCK_LEN - 1,
- BLOCK_LEN,
- BLOCK_LEN + 1,
- 2 * BLOCK_LEN - 1,
- 2 * BLOCK_LEN,
- 2 * BLOCK_LEN + 1,
- CHUNK_LEN - 1,
- CHUNK_LEN,
- CHUNK_LEN + 1,
- 2 * CHUNK_LEN,
- 2 * CHUNK_LEN + 1,
- 3 * CHUNK_LEN,
- 3 * CHUNK_LEN + 1,
- 4 * CHUNK_LEN,
- 4 * CHUNK_LEN + 1,
- 5 * CHUNK_LEN,
- 5 * CHUNK_LEN + 1,
- 6 * CHUNK_LEN,
- 6 * CHUNK_LEN + 1,
- 7 * CHUNK_LEN,
- 7 * CHUNK_LEN + 1,
- 8 * CHUNK_LEN,
- 8 * CHUNK_LEN + 1,
- 16 * CHUNK_LEN, // AVX512's bandwidth
- 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1
- 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks
-];
-
-pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN;
-
-// There's a test to make sure these two are equal below.
-pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
-pub const TEST_KEY_WORDS: CVWords = [
- 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
-];
-
-// Paint the input with a repeating byte pattern. We use a cycle length of 251,
-// because that's the largest prime number less than 256. This makes it
-// unlikely that swapping any two adjacent input blocks or chunks will give the
-// same answer.
-pub fn paint_test_input(buf: &mut [u8]) {
- for (i, b) in buf.iter_mut().enumerate() {
- *b = (i % 251) as u8;
- }
-}
-
-type CompressInPlaceFn =
- unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
-
-type CompressXofFn = unsafe fn(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64];
-
-// A shared helper function for platform-specific tests.
-pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
- let initial_state = TEST_KEY_WORDS;
- let block_len: u8 = 61;
- let mut block = [0; BLOCK_LEN];
- paint_test_input(&mut block[..block_len as usize]);
- // Use a counter with set bits in both 32-bit words.
- let counter = (5u64 << 32) + 6;
- let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
-
- let portable_out =
- crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags);
-
- let mut test_state = initial_state;
- unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) };
- let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
- let test_xof =
- unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) };
-
- assert_eq!(&portable_out[..32], &test_state_bytes[..]);
- assert_eq!(&portable_out[..], &test_xof[..]);
-}
-
-type HashManyFn<A> = unsafe fn(
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-);
-
-// A shared helper function for platform-specific tests.
-pub fn test_hash_many_fn(
- hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>,
- hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>,
-) {
- // 31 (16 + 8 + 4 + 2 + 1) inputs
- const NUM_INPUTS: usize = 31;
- let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
- crate::test::paint_test_input(&mut input_buf);
- // A counter just prior to u32::MAX.
- let counter = (1u64 << 32) - 1;
-
- // First hash chunks.
- let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new();
- for i in 0..NUM_INPUTS {
- chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN));
- }
- let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
- crate::portable::hash_many(
- &chunks,
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::Yes,
- crate::KEYED_HASH,
- crate::CHUNK_START,
- crate::CHUNK_END,
- &mut portable_chunks_out,
- );
-
- let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- hash_many_chunks_fn(
- &chunks[..],
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::Yes,
- crate::KEYED_HASH,
- crate::CHUNK_START,
- crate::CHUNK_END,
- &mut test_chunks_out,
- );
- }
- for n in 0..NUM_INPUTS {
- #[cfg(feature = "std")]
- dbg!(n);
- assert_eq!(
- &portable_chunks_out[n * OUT_LEN..][..OUT_LEN],
- &test_chunks_out[n * OUT_LEN..][..OUT_LEN]
- );
- }
-
- // Then hash parents.
- let mut parents = ArrayVec::<[&[u8; 2 * OUT_LEN]; NUM_INPUTS]>::new();
- for i in 0..NUM_INPUTS {
- parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN));
- }
- let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
- crate::portable::hash_many(
- &parents,
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::No,
- crate::KEYED_HASH | crate::PARENT,
- 0,
- 0,
- &mut portable_parents_out,
- );
-
- let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- hash_many_parents_fn(
- &parents[..],
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::No,
- crate::KEYED_HASH | crate::PARENT,
- 0,
- 0,
- &mut test_parents_out,
- );
- }
- for n in 0..NUM_INPUTS {
- #[cfg(feature = "std")]
- dbg!(n);
- assert_eq!(
- &portable_parents_out[n * OUT_LEN..][..OUT_LEN],
- &test_parents_out[n * OUT_LEN..][..OUT_LEN]
- );
- }
-}
-
-#[test]
-fn test_key_bytes_equal_key_words() {
- assert_eq!(
- TEST_KEY_WORDS,
- crate::platform::words_from_le_bytes_32(&TEST_KEY),
- );
-}
-
-#[test]
-fn test_reference_impl_size() {
- // Because the Rust compiler optimizes struct layout, it's possible that
- // some future version of the compiler will produce a different size. If
- // that happens, we can either disable this test, or test for multiple
- // expected values. For now, the purpose of this test is to make sure we
- // notice if that happens.
- assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>());
-}
-
-#[test]
-fn test_counter_words() {
- let counter: u64 = (1 << 32) + 2;
- assert_eq!(crate::counter_low(counter), 2);
- assert_eq!(crate::counter_high(counter), 1);
-}
-
-#[test]
-fn test_largest_power_of_two_leq() {
- let input_output = &[
- // The zero case is nonsensical, but it does work.
- (0, 1),
- (1, 1),
- (2, 2),
- (3, 2),
- (4, 4),
- (5, 4),
- (6, 4),
- (7, 4),
- (8, 8),
- // the largest possible usize
- (usize::MAX, (usize::MAX >> 1) + 1),
- ];
- for &(input, output) in input_output {
- assert_eq!(
- output,
- crate::largest_power_of_two_leq(input),
- "wrong output for n={}",
- input
- );
- }
-}
-
-#[test]
-fn test_left_len() {
- let input_output = &[
- (CHUNK_LEN + 1, CHUNK_LEN),
- (2 * CHUNK_LEN - 1, CHUNK_LEN),
- (2 * CHUNK_LEN, CHUNK_LEN),
- (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN),
- (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN),
- (4 * CHUNK_LEN, 2 * CHUNK_LEN),
- (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN),
- ];
- for &(input, output) in input_output {
- assert_eq!(crate::left_len(input), output);
- }
-}
-
-#[test]
-fn test_compare_reference_impl() {
- const OUT: usize = 303; // more than 64, not a multiple of 4
- let mut input_buf = [0; TEST_CASES_MAX];
- paint_test_input(&mut input_buf);
- for &case in TEST_CASES {
- let input = &input_buf[..case];
- #[cfg(feature = "std")]
- dbg!(case);
-
- // regular
- {
- let mut reference_hasher = reference_impl::Hasher::new();
- reference_hasher.update(input);
- let mut expected_out = [0; OUT];
- reference_hasher.finalize(&mut expected_out);
-
- // all at once
- let test_out = crate::hash(input);
- assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
- // incremental
- let mut hasher = crate::Hasher::new();
- hasher.update(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
- assert_eq!(hasher.finalize(), test_out);
- // xof
- let mut extended = [0; OUT];
- hasher.finalize_xof().fill(&mut extended);
- assert_eq!(extended[..], expected_out[..]);
- }
-
- // keyed
- {
- let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
- reference_hasher.update(input);
- let mut expected_out = [0; OUT];
- reference_hasher.finalize(&mut expected_out);
-
- // all at once
- let test_out = crate::keyed_hash(&TEST_KEY, input);
- assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
- // incremental
- let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
- hasher.update(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
- assert_eq!(hasher.finalize(), test_out);
- // xof
- let mut extended = [0; OUT];
- hasher.finalize_xof().fill(&mut extended);
- assert_eq!(extended[..], expected_out[..]);
- }
-
- // derive_key
- {
- let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)";
- let mut reference_hasher = reference_impl::Hasher::new_derive_key(context);
- reference_hasher.update(input);
- let mut expected_out = [0; OUT];
- reference_hasher.finalize(&mut expected_out);
-
- // all at once
- let mut test_out = [0; OUT];
- crate::derive_key(context, input, &mut test_out);
- assert_eq!(test_out[..], expected_out[..]);
- // incremental
- let mut hasher = crate::Hasher::new_derive_key(context);
- hasher.update(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
- assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
- // xof
- let mut extended = [0; OUT];
- hasher.finalize_xof().fill(&mut extended);
- assert_eq!(extended[..], expected_out[..]);
- }
- }
-}
-
-fn reference_hash(input: &[u8]) -> crate::Hash {
- let mut hasher = reference_impl::Hasher::new();
- hasher.update(input);
- let mut bytes = [0; 32];
- hasher.finalize(&mut bytes);
- bytes.into()
-}
-
-#[test]
-fn test_compare_update_multiple() {
- // Don't use all the long test cases here, since that's unnecessarily slow
- // in debug mode.
- let mut short_test_cases = TEST_CASES;
- while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN {
- short_test_cases = &short_test_cases[..short_test_cases.len() - 1];
- }
- assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN);
-
- let mut input_buf = [0; 2 * TEST_CASES_MAX];
- paint_test_input(&mut input_buf);
-
- for &first_update in short_test_cases {
- #[cfg(feature = "std")]
- dbg!(first_update);
- let first_input = &input_buf[..first_update];
- let mut test_hasher = crate::Hasher::new();
- test_hasher.update(first_input);
-
- for &second_update in short_test_cases {
- #[cfg(feature = "std")]
- dbg!(second_update);
- let second_input = &input_buf[first_update..][..second_update];
- let total_input = &input_buf[..first_update + second_update];
-
- // Clone the hasher with first_update bytes already written, so
- // that the next iteration can reuse it.
- let mut test_hasher = test_hasher.clone();
- test_hasher.update(second_input);
- let expected = reference_hash(total_input);
- assert_eq!(expected, test_hasher.finalize());
- }
- }
-}
-
-#[test]
-fn test_fuzz_hasher() {
- const INPUT_MAX: usize = 4 * CHUNK_LEN;
- let mut input_buf = [0; 3 * INPUT_MAX];
- paint_test_input(&mut input_buf);
-
- // Don't do too many iterations in debug mode, to keep the tests under a
- // second or so. CI should run tests in release mode also. Provide an
- // environment variable for specifying a larger number of fuzz iterations.
- let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 };
-
- // Use a fixed RNG seed for reproducibility.
- let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]);
- for _num_test in 0..num_tests {
- #[cfg(feature = "std")]
- dbg!(_num_test);
- let mut hasher = crate::Hasher::new();
- let mut total_input = 0;
- // For each test, write 3 inputs of random length.
- for _ in 0..3 {
- let input_len = rng.gen_range(0, INPUT_MAX + 1);
- #[cfg(feature = "std")]
- dbg!(input_len);
- let input = &input_buf[total_input..][..input_len];
- hasher.update(input);
- total_input += input_len;
- }
- let expected = reference_hash(&input_buf[..total_input]);
- assert_eq!(expected, hasher.finalize());
- }
-}
-
-#[test]
-fn test_xof_seek() {
- let mut out = [0; 533];
- let mut hasher = crate::Hasher::new();
- hasher.update(b"foo");
- hasher.finalize_xof().fill(&mut out);
- assert_eq!(hasher.finalize().as_bytes(), &out[0..32]);
-
- let mut reader = hasher.finalize_xof();
- reader.set_position(303);
- let mut out2 = [0; 102];
- reader.fill(&mut out2);
- assert_eq!(&out[303..][..102], &out2[..]);
-
- #[cfg(feature = "std")]
- {
- use std::io::prelude::*;
- let mut reader = hasher.finalize_xof();
- reader.seek(std::io::SeekFrom::Start(303)).unwrap();
- let mut out3 = Vec::new();
- reader.by_ref().take(102).read_to_end(&mut out3).unwrap();
- assert_eq!(&out[303..][..102], &out3[..]);
-
- assert_eq!(
- reader.seek(std::io::SeekFrom::Current(0)).unwrap(),
- 303 + 102
- );
- reader.seek(std::io::SeekFrom::Current(-5)).unwrap();
- assert_eq!(
- reader.seek(std::io::SeekFrom::Current(0)).unwrap(),
- 303 + 102 - 5
- );
- let mut out4 = [0; 17];
- assert_eq!(reader.read(&mut out4).unwrap(), 17);
- assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]);
- assert_eq!(
- reader.seek(std::io::SeekFrom::Current(0)).unwrap(),
- 303 + 102 - 5 + 17
- );
- assert!(reader.seek(std::io::SeekFrom::End(0)).is_err());
- assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err());
- }
-}
-
-#[test]
-fn test_msg_schdule_permutation() {
- let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
-
- let mut generated = [[0; 16]; 7];
- generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
-
- for round in 1..7 {
- for i in 0..16 {
- generated[round][i] = generated[round - 1][permutation[i]];
- }
- }
-
- assert_eq!(generated, crate::MSG_SCHEDULE);
-}
-
-#[test]
-fn test_reset() {
- let mut hasher = crate::Hasher::new();
- hasher.update(&[42; 3 * CHUNK_LEN + 7]);
- hasher.reset();
- hasher.update(&[42; CHUNK_LEN + 3]);
- assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3]));
-
- let key = &[99; crate::KEY_LEN];
- let mut keyed_hasher = crate::Hasher::new_keyed(key);
- keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]);
- keyed_hasher.reset();
- keyed_hasher.update(&[42; CHUNK_LEN + 3]);
- assert_eq!(
- keyed_hasher.finalize(),
- crate::keyed_hash(key, &[42; CHUNK_LEN + 3]),
- );
-
- let context = "BLAKE3 2020-02-12 10:20:58 reset test";
- let mut kdf = crate::Hasher::new_derive_key(context);
- kdf.update(&[42; 3 * CHUNK_LEN + 7]);
- kdf.reset();
- kdf.update(&[42; CHUNK_LEN + 3]);
- let mut expected = [0; crate::OUT_LEN];
- crate::derive_key(context, &[42; CHUNK_LEN + 3], &mut expected);
- assert_eq!(kdf.finalize(), expected);
-}
-
-#[test]
-#[cfg(feature = "rayon")]
-fn test_update_with_rayon_join() {
- let mut input = [0; TEST_CASES_MAX];
- paint_test_input(&mut input);
- let rayon_hash = crate::Hasher::new()
- .update_with_join::<crate::join::RayonJoin>(&input)
- .finalize();
- assert_eq!(crate::hash(&input), rayon_hash);
-}
-
-// Test that the length values given to Join::join are what they're supposed to
-// be.
-#[test]
-fn test_join_lengths() {
- // Use static atomics to let us safely get a couple of values in and out of
- // CustomJoin. This avoids depending on std, though it assumes that this
- // thread will only run once in the lifetime of the runner process.
- static SINGLE_THREAD_LEN: AtomicUsize = AtomicUsize::new(0);
- static CUSTOM_JOIN_CALLS: AtomicUsize = AtomicUsize::new(0);
-
- // Use an input that's exactly (simd_degree * CHUNK_LEN) + 1. That should
- // guarantee that compress_subtree_wide does exactly one split, with the
- // last byte on the right side. Note that if we used
- // Hasher::update_with_join, we would end up buffering that last byte,
- // rather than splitting and joining it.
- let single_thread_len = crate::platform::Platform::detect().simd_degree() * CHUNK_LEN;
- SINGLE_THREAD_LEN.store(single_thread_len, Ordering::SeqCst);
- let mut input_buf = [0; 2 * crate::platform::MAX_SIMD_DEGREE * CHUNK_LEN];
- paint_test_input(&mut input_buf);
- let input = &input_buf[..single_thread_len + 1];
-
- enum CustomJoin {}
-
- impl crate::join::Join for CustomJoin {
- fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB)
- where
- A: FnOnce() -> RA + Send,
- B: FnOnce() -> RB + Send,
- RA: Send,
- RB: Send,
- {
- let prev_calls = CUSTOM_JOIN_CALLS.fetch_add(1, Ordering::SeqCst);
- assert_eq!(prev_calls, 0);
- assert_eq!(len_a, SINGLE_THREAD_LEN.load(Ordering::SeqCst));
- assert_eq!(len_b, 1);
- (oper_a(), oper_b())
- }
- }
-
- let mut out_buf = [0; crate::platform::MAX_SIMD_DEGREE_OR_2 * CHUNK_LEN];
- crate::compress_subtree_wide::<CustomJoin>(
- input,
- crate::IV,
- 0,
- 0,
- crate::platform::Platform::detect(),
- &mut out_buf,
- );
- assert_eq!(CUSTOM_JOIN_CALLS.load(Ordering::SeqCst), 1);
-}
diff --git a/thirdparty/BLAKE3/src/traits.rs b/thirdparty/BLAKE3/src/traits.rs
deleted file mode 100644
index 9704e0106..000000000
--- a/thirdparty/BLAKE3/src/traits.rs
+++ /dev/null
@@ -1,184 +0,0 @@
-//! Implementations of commonly used traits like
-//! [`digest::Digest`](https://crates.io/crates/digest) and
-//! [`crypto_mac::Mac`](https://crates.io/crates/crypto-mac).
-
-pub use crypto_mac;
-pub use digest;
-
-use crate::{Hasher, OutputReader};
-use digest::generic_array::{
- typenum::{U32, U64},
- GenericArray,
-};
-
-impl digest::BlockInput for Hasher {
- type BlockSize = U64;
-}
-
-impl digest::Update for Hasher {
- #[inline]
- fn update(&mut self, data: impl AsRef<[u8]>) {
- self.update(data.as_ref());
- }
-}
-
-impl digest::Reset for Hasher {
- #[inline]
- fn reset(&mut self) {
- self.reset(); // the inherent method
- }
-}
-
-impl digest::FixedOutput for Hasher {
- type OutputSize = U32;
-
- #[inline]
- fn finalize_into(self, out: &mut GenericArray<u8, Self::OutputSize>) {
- out.copy_from_slice(self.finalize().as_bytes());
- }
-
- #[inline]
- fn finalize_into_reset(&mut self, out: &mut GenericArray<u8, Self::OutputSize>) {
- out.copy_from_slice(self.finalize().as_bytes());
- self.reset();
- }
-}
-
-impl digest::ExtendableOutput for Hasher {
- type Reader = OutputReader;
-
- #[inline]
- fn finalize_xof(self) -> Self::Reader {
- Hasher::finalize_xof(&self)
- }
-
- #[inline]
- fn finalize_xof_reset(&mut self) -> Self::Reader {
- let reader = Hasher::finalize_xof(self);
- self.reset();
- reader
- }
-}
-
-impl digest::XofReader for OutputReader {
- #[inline]
- fn read(&mut self, buffer: &mut [u8]) {
- self.fill(buffer);
- }
-}
-
-impl crypto_mac::NewMac for Hasher {
- type KeySize = U32;
-
- #[inline]
- fn new(key: &crypto_mac::Key<Self>) -> Self {
- let key_bytes: [u8; 32] = (*key).into();
- Hasher::new_keyed(&key_bytes)
- }
-}
-
-impl crypto_mac::Mac for Hasher {
- type OutputSize = U32;
-
- #[inline]
- fn update(&mut self, data: &[u8]) {
- self.update(data);
- }
-
- #[inline]
- fn reset(&mut self) {
- self.reset();
- }
-
- #[inline]
- fn finalize(self) -> crypto_mac::Output<Self> {
- crypto_mac::Output::new(digest::Digest::finalize(self))
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
-
- #[test]
- fn test_digest_traits() {
- // Inherent methods.
- let mut hasher1 = crate::Hasher::new();
- hasher1.update(b"foo");
- hasher1.update(b"bar");
- hasher1.update(b"baz");
- let out1 = hasher1.finalize();
- let mut xof1 = [0; 301];
- hasher1.finalize_xof().fill(&mut xof1);
- assert_eq!(out1.as_bytes(), &xof1[..32]);
-
- // Trait implementations.
- let mut hasher2: crate::Hasher = digest::Digest::new();
- digest::Digest::update(&mut hasher2, b"xxx");
- digest::Digest::reset(&mut hasher2);
- digest::Digest::update(&mut hasher2, b"foo");
- digest::Digest::update(&mut hasher2, b"bar");
- digest::Digest::update(&mut hasher2, b"baz");
- let out2 = digest::Digest::finalize(hasher2.clone());
- let mut xof2 = [0; 301];
- digest::XofReader::read(
- &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()),
- &mut xof2,
- );
- assert_eq!(out1.as_bytes(), &out2[..]);
- assert_eq!(xof1[..], xof2[..]);
-
- // Again with the resetting variants.
- let mut hasher3: crate::Hasher = digest::Digest::new();
- digest::Digest::update(&mut hasher3, b"foobarbaz");
- let mut out3 = [0; 32];
- digest::FixedOutput::finalize_into_reset(
- &mut hasher3,
- GenericArray::from_mut_slice(&mut out3),
- );
- digest::Digest::update(&mut hasher3, b"foobarbaz");
- let mut out4 = [0; 32];
- digest::FixedOutput::finalize_into_reset(
- &mut hasher3,
- GenericArray::from_mut_slice(&mut out4),
- );
- digest::Digest::update(&mut hasher3, b"foobarbaz");
- let mut xof3 = [0; 301];
- digest::XofReader::read(
- &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3),
- &mut xof3,
- );
- digest::Digest::update(&mut hasher3, b"foobarbaz");
- let mut xof4 = [0; 301];
- digest::XofReader::read(
- &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3),
- &mut xof4,
- );
- assert_eq!(out1.as_bytes(), &out3[..]);
- assert_eq!(out1.as_bytes(), &out4[..]);
- assert_eq!(xof1[..], xof3[..]);
- assert_eq!(xof1[..], xof4[..]);
- }
-
- #[test]
- fn test_mac_trait() {
- // Inherent methods.
- let key = b"some super secret key bytes fooo";
- let mut hasher1 = crate::Hasher::new_keyed(key);
- hasher1.update(b"foo");
- hasher1.update(b"bar");
- hasher1.update(b"baz");
- let out1 = hasher1.finalize();
-
- // Trait implementation.
- let generic_key = (*key).into();
- let mut hasher2: crate::Hasher = crypto_mac::NewMac::new(&generic_key);
- crypto_mac::Mac::update(&mut hasher2, b"xxx");
- crypto_mac::Mac::reset(&mut hasher2);
- crypto_mac::Mac::update(&mut hasher2, b"foo");
- crypto_mac::Mac::update(&mut hasher2, b"bar");
- crypto_mac::Mac::update(&mut hasher2, b"baz");
- let out2 = crypto_mac::Mac::finalize(hasher2);
- assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice());
- }
-}
diff --git a/thirdparty/BLAKE3/test_vectors/Cargo.toml b/thirdparty/BLAKE3/test_vectors/Cargo.toml
deleted file mode 100644
index cd74a9df0..000000000
--- a/thirdparty/BLAKE3/test_vectors/Cargo.toml
+++ /dev/null
@@ -1,18 +0,0 @@
-[package]
-name = "test_vectors"
-version = "0.0.0"
-edition = "2018"
-
-[features]
-neon = ["blake3/neon"]
-prefer_intrinsics = ["blake3/prefer_intrinsics"]
-pure = ["blake3/pure"]
-
-[dependencies]
-# If you ever change these path dependencies, you'll probably need to update
-# cross_test.sh, or CI will break. I'm sorry >.<
-blake3 = { path = "../" }
-hex = "0.4.0"
-reference_impl = { path = "../reference_impl" }
-serde = { version = "1.0", features = ["derive"] }
-serde_json = "1.0"
diff --git a/thirdparty/BLAKE3/test_vectors/cross_test.sh b/thirdparty/BLAKE3/test_vectors/cross_test.sh
deleted file mode 100644
index c4d280c9d..000000000
--- a/thirdparty/BLAKE3/test_vectors/cross_test.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /usr/bin/env bash
-
-# This hacky script works around the fact that `cross test` does not support
-# path dependencies. (It uses a docker shared folder to let the guest access
-# project files, so parent directories aren't available.) Solve this problem by
-# copying the entire project to a temp dir and rearranging paths to put
-# "blake3" and "reference_impl" underneath "test_vectors", so that everything
-# is accessible. Hopefully this will just run on CI forever and no one will
-# ever read this and discover my deep shame.
-
-set -e -u -o pipefail
-
-project_root="$(realpath "$(dirname "$BASH_SOURCE")/..")"
-tmpdir="$(mktemp -d)"
-echo "Running cross tests in $tmpdir"
-cd "$tmpdir"
-git clone "$project_root" blake3
-mv blake3/test_vectors .
-mv blake3/reference_impl test_vectors
-mv blake3 test_vectors
-cd test_vectors
-sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml
-sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
-
-cross test "$@"
diff --git a/thirdparty/BLAKE3/test_vectors/src/lib.rs b/thirdparty/BLAKE3/test_vectors/src/lib.rs
deleted file mode 100644
index 04460f668..000000000
--- a/thirdparty/BLAKE3/test_vectors/src/lib.rs
+++ /dev/null
@@ -1,349 +0,0 @@
-use blake3::{BLOCK_LEN, CHUNK_LEN};
-use serde::{Deserialize, Serialize};
-
-// A non-multiple of 4 is important, since one possible bug is to fail to emit
-// partial words.
-pub const OUTPUT_LEN: usize = 2 * blake3::BLOCK_LEN + 3;
-
-pub const TEST_CASES: &[usize] = &[
- 0,
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7,
- 8,
- BLOCK_LEN - 1,
- BLOCK_LEN,
- BLOCK_LEN + 1,
- 2 * BLOCK_LEN - 1,
- 2 * BLOCK_LEN,
- 2 * BLOCK_LEN + 1,
- CHUNK_LEN - 1,
- CHUNK_LEN,
- CHUNK_LEN + 1,
- 2 * CHUNK_LEN,
- 2 * CHUNK_LEN + 1,
- 3 * CHUNK_LEN,
- 3 * CHUNK_LEN + 1,
- 4 * CHUNK_LEN,
- 4 * CHUNK_LEN + 1,
- 5 * CHUNK_LEN,
- 5 * CHUNK_LEN + 1,
- 6 * CHUNK_LEN,
- 6 * CHUNK_LEN + 1,
- 7 * CHUNK_LEN,
- 7 * CHUNK_LEN + 1,
- 8 * CHUNK_LEN,
- 8 * CHUNK_LEN + 1,
- 16 * CHUNK_LEN, // AVX512's bandwidth
- 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1
- 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks
-];
-
-pub const TEST_KEY: &[u8; blake3::KEY_LEN] = b"whats the Elvish word for friend";
-pub const TEST_CONTEXT: &str = "BLAKE3 2019-12-27 16:29:52 test vectors context";
-
-const COMMENT: &str = r#"
-Each test is an input length and three outputs, one for each of the hash,
-keyed_hash, and derive_key modes. The input in each case is filled with a
-repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on.
-The key used with keyed_hash is the 32-byte ASCII string "whats the Elvish word
-for friend", also given in the `key` field below. The context string used with
-derive_key is the ASCII string "BLAKE3 2019-12-27 16:29:52 test vectors
-context", also given in the `context_string` field below. Outputs are encoded
-as hexadecimal. Each case is an extended output, and implementations should
-also check that the first 32 bytes match their default-length output.
-"#;
-
-// Paint the input with a repeating byte pattern. We use a cycle length of 251,
-// because that's the largets prime number less than 256. This makes it
-// unlikely to swapping any two adjacent input blocks or chunks will give the
-// same answer.
-pub fn paint_test_input(buf: &mut [u8]) {
- for (i, b) in buf.iter_mut().enumerate() {
- *b = (i % 251) as u8;
- }
-}
-
-#[derive(Debug, Serialize, Deserialize)]
-pub struct Cases {
- pub _comment: String,
- pub key: String,
- pub context_string: String,
- pub cases: Vec<Case>,
-}
-
-#[derive(Debug, Serialize, Deserialize)]
-pub struct Case {
- pub input_len: usize,
- pub hash: String,
- pub keyed_hash: String,
- pub derive_key: String,
-}
-
-pub fn generate_json() -> String {
- let mut cases = Vec::new();
- for &input_len in TEST_CASES {
- let mut input = vec![0; input_len];
- paint_test_input(&mut input);
-
- let mut hash_out = [0; OUTPUT_LEN];
- blake3::Hasher::new()
- .update(&input)
- .finalize_xof()
- .fill(&mut hash_out);
-
- let mut keyed_hash_out = [0; OUTPUT_LEN];
- blake3::Hasher::new_keyed(TEST_KEY)
- .update(&input)
- .finalize_xof()
- .fill(&mut keyed_hash_out);
-
- let mut derive_key_out = [0; OUTPUT_LEN];
- blake3::Hasher::new_derive_key(TEST_CONTEXT)
- .update(&input)
- .finalize_xof()
- .fill(&mut derive_key_out);
-
- cases.push(Case {
- input_len,
- hash: hex::encode(&hash_out[..]),
- keyed_hash: hex::encode(&keyed_hash_out[..]),
- derive_key: hex::encode(&derive_key_out[..]),
- });
- }
-
- let mut json = serde_json::to_string_pretty(&Cases {
- _comment: COMMENT.trim().replace("\n", " "),
- key: std::str::from_utf8(TEST_KEY).unwrap().to_string(),
- context_string: TEST_CONTEXT.to_string(),
- cases,
- })
- .unwrap();
-
- // Add a trailing newline.
- json.push('\n');
- json
-}
-
-pub fn read_test_vectors_file() -> String {
- let test_vectors_file_path = "./test_vectors.json";
- std::fs::read_to_string(test_vectors_file_path).expect("failed to read test_vectors.json")
-}
-
-pub fn parse_test_cases() -> Cases {
- let json = read_test_vectors_file();
- serde_json::from_str(&json).expect("failed to parse test_vectors.json")
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use std::convert::TryInto;
-
- fn test_reference_impl_all_at_once(
- key: &[u8; blake3::KEY_LEN],
- input: &[u8],
- expected_hash: &[u8],
- expected_keyed_hash: &[u8],
- expected_derive_key: &[u8],
- ) {
- let mut out = vec![0; expected_hash.len()];
- let mut hasher = reference_impl::Hasher::new();
- hasher.update(input);
- hasher.finalize(&mut out);
- assert_eq!(expected_hash, &out[..]);
-
- let mut out = vec![0; expected_keyed_hash.len()];
- let mut hasher = reference_impl::Hasher::new_keyed(key);
- hasher.update(input);
- hasher.finalize(&mut out);
- assert_eq!(expected_keyed_hash, &out[..]);
-
- let mut out = vec![0; expected_derive_key.len()];
- let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT);
- hasher.update(input);
- hasher.finalize(&mut out);
- assert_eq!(expected_derive_key, &out[..]);
- }
-
- fn test_reference_impl_one_at_a_time(
- key: &[u8; blake3::KEY_LEN],
- input: &[u8],
- expected_hash: &[u8],
- expected_keyed_hash: &[u8],
- expected_derive_key: &[u8],
- ) {
- let mut out = vec![0; expected_hash.len()];
- let mut hasher = reference_impl::Hasher::new();
- for &b in input {
- hasher.update(&[b]);
- }
- hasher.finalize(&mut out);
- assert_eq!(expected_hash, &out[..]);
-
- let mut out = vec![0; expected_keyed_hash.len()];
- let mut hasher = reference_impl::Hasher::new_keyed(key);
- for &b in input {
- hasher.update(&[b]);
- }
- hasher.finalize(&mut out);
- assert_eq!(expected_keyed_hash, &out[..]);
-
- let mut out = vec![0; expected_derive_key.len()];
- let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT);
- for &b in input {
- hasher.update(&[b]);
- }
- hasher.finalize(&mut out);
- assert_eq!(expected_derive_key, &out[..]);
- }
-
- fn test_incremental_all_at_once(
- key: &[u8; blake3::KEY_LEN],
- input: &[u8],
- expected_hash: &[u8],
- expected_keyed_hash: &[u8],
- expected_derive_key: &[u8],
- ) {
- let mut out = vec![0; expected_hash.len()];
- let mut hasher = blake3::Hasher::new();
- hasher.update(input);
- hasher.finalize_xof().fill(&mut out);
- assert_eq!(expected_hash, &out[..]);
- assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes());
-
- let mut out = vec![0; expected_keyed_hash.len()];
- let mut hasher = blake3::Hasher::new_keyed(key);
- hasher.update(input);
- hasher.finalize_xof().fill(&mut out);
- assert_eq!(expected_keyed_hash, &out[..]);
- assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes());
-
- let mut out = vec![0; expected_derive_key.len()];
- let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT);
- hasher.update(input);
- hasher.finalize_xof().fill(&mut out);
- assert_eq!(expected_derive_key, &out[..]);
- assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes());
- }
-
- fn test_incremental_one_at_a_time(
- key: &[u8; blake3::KEY_LEN],
- input: &[u8],
- expected_hash: &[u8],
- expected_keyed_hash: &[u8],
- expected_derive_key: &[u8],
- ) {
- let mut out = vec![0; expected_hash.len()];
- let mut hasher = blake3::Hasher::new();
- for &b in input {
- hasher.update(&[b]);
- }
- hasher.finalize_xof().fill(&mut out);
- assert_eq!(expected_hash, &out[..]);
- assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes());
-
- let mut out = vec![0; expected_keyed_hash.len()];
- let mut hasher = blake3::Hasher::new_keyed(key);
- for &b in input {
- hasher.update(&[b]);
- }
- hasher.finalize_xof().fill(&mut out);
- assert_eq!(expected_keyed_hash, &out[..]);
- assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes());
-
- let mut out = vec![0; expected_derive_key.len()];
- let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT);
- for &b in input {
- hasher.update(&[b]);
- }
- hasher.finalize_xof().fill(&mut out);
- assert_eq!(expected_derive_key, &out[..]);
- assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes());
- }
-
- fn test_recursive(
- key: &[u8; blake3::KEY_LEN],
- input: &[u8],
- expected_hash: &[u8],
- expected_keyed_hash: &[u8],
- expected_derive_key: &[u8],
- ) {
- assert_eq!(&expected_hash[..32], blake3::hash(input).as_bytes());
- assert_eq!(
- &expected_keyed_hash[..32],
- &blake3::keyed_hash(key, input).as_bytes()[..],
- );
- let mut derive_key_out = vec![0; expected_derive_key.len()];
- blake3::derive_key(TEST_CONTEXT, input, &mut derive_key_out);
- assert_eq!(expected_derive_key, &derive_key_out[..],);
- }
-
- #[test]
- fn run_test_vectors() {
- let cases = parse_test_cases();
- let key: &[u8; blake3::KEY_LEN] = cases.key.as_bytes().try_into().unwrap();
- for case in &cases.cases {
- dbg!(case.input_len);
- let mut input = vec![0; case.input_len];
- paint_test_input(&mut input);
- let expected_hash = hex::decode(&case.hash).unwrap();
- let expected_keyed_hash = hex::decode(&case.keyed_hash).unwrap();
- let expected_derive_key = hex::decode(&case.derive_key).unwrap();
-
- test_reference_impl_all_at_once(
- key,
- &input,
- &expected_hash,
- &expected_keyed_hash,
- &expected_derive_key,
- );
-
- test_reference_impl_one_at_a_time(
- key,
- &input,
- &expected_hash,
- &expected_keyed_hash,
- &expected_derive_key,
- );
-
- test_incremental_all_at_once(
- key,
- &input,
- &expected_hash,
- &expected_keyed_hash,
- &expected_derive_key,
- );
-
- test_incremental_one_at_a_time(
- key,
- &input,
- &expected_hash,
- &expected_keyed_hash,
- &expected_derive_key,
- );
-
- test_recursive(
- key,
- &input,
- &expected_hash,
- &expected_keyed_hash,
- &expected_derive_key,
- );
- }
- }
-
- #[test]
- fn test_checked_in_vectors_up_to_date() {
- // Replace Windows newlines, in case Git is configured to alter
- // newlines when files are checked out.
- let json = read_test_vectors_file().replace("\r\n", "\n");
- if generate_json() != json {
- panic!("Checked-in test_vectors.json is not up to date. Regenerate with `cargo run --bin generate > ./test_vectors.json`.");
- }
- }
-}
diff --git a/thirdparty/BLAKE3/test_vectors/test_vectors.json b/thirdparty/BLAKE3/test_vectors/test_vectors.json
deleted file mode 100644
index f6da91792..000000000
--- a/thirdparty/BLAKE3/test_vectors/test_vectors.json
+++ /dev/null
@@ -1,217 +0,0 @@
-{
- "_comment": "Each test is an input length and three outputs, one for each of the hash, keyed_hash, and derive_key modes. The input in each case is filled with a repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. The key used with keyed_hash is the 32-byte ASCII string \"whats the Elvish word for friend\", also given in the `key` field below. The context string used with derive_key is the ASCII string \"BLAKE3 2019-12-27 16:29:52 test vectors context\", also given in the `context_string` field below. Outputs are encoded as hexadecimal. Each case is an extended output, and implementations should also check that the first 32 bytes match their default-length output.",
- "key": "whats the Elvish word for friend",
- "context_string": "BLAKE3 2019-12-27 16:29:52 test vectors context",
- "cases": [
- {
- "input_len": 0,
- "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d",
- "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f",
- "derive_key": "2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0"
- },
- {
- "input_len": 1,
- "hash": "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5",
- "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11",
- "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551"
- },
- {
- "input_len": 2,
- "hash": "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a432263a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1",
- "keyed_hash": "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9ffbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f658be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f",
- "derive_key": "1f166565a7df0098ee65922d7fea425fb18b9943f19d6161e2d17939356168e6daa59cae19892b2d54f6fc9f475d26031fd1c22ae0a3e8ef7bdb23f452a15e0027629d2e867b1bb1e6ab21c71297377750826c404dfccc2406bd57a83775f89e0b075e59a7732326715ef912078e213944f490ad68037557518b79c0086de6d6f6cdd2"
- },
- {
- "input_len": 3,
- "hash": "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cdd0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134",
- "keyed_hash": "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f",
- "derive_key": "440aba35cb006b61fc17c0529255de438efc06a8c9ebf3f2ddac3b5a86705797f27e2e914574f4d87ec04c379e12789eccbfbc15892626042707802dbe4e97c3ff59dca80c1e54246b6d055154f7348a39b7d098b2b4824ebe90e104e763b2a447512132cede16243484a55a4e40a85790038bb0dcf762e8c053cabae41bbe22a5bff7"
- },
- {
- "input_len": 4,
- "hash": "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e65797211701dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12",
- "keyed_hash": "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe070116c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a",
- "derive_key": "f46085c8190d69022369ce1a18880e9b369c135eb93f3c63550d3e7630e91060fbd7d8f4258bec9da4e05044f88b91944f7cab317a2f0c18279629a3867fad0662c9ad4d42c6f27e5b124da17c8c4f3a94a025ba5d1b623686c6099d202a7317a82e3d95dae46a87de0555d727a5df55de44dab799a20dffe239594d6e99ed17950910"
- },
- {
- "input_len": 5,
- "hash": "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2ebcfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2ca748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c999904037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620",
- "keyed_hash": "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616ab199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218",
- "derive_key": "1f24eda69dbcb752847ec3ebb5dd42836d86e58500c7c98d906ecd82ed9ae47f6f48a3f67e4e43329c9a89b1ca526b9b35cbf7d25c1e353baffb590fd79be58ddb6c711f1a6b60e98620b851c688670412fcb0435657ba6b638d21f0f2a04f2f6b0bd8834837b10e438d5f4c7c2c71299cf7586ea9144ed09253d51f8f54dd6bff719d"
- },
- {
- "input_len": 6,
- "hash": "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844611a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a",
- "keyed_hash": "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e8807800842a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256",
- "derive_key": "be96b30b37919fe4379dfbe752ae77b4f7e2ab92f7ff27435f76f2f065f6a5f435ae01a1d14bd5a6b3b69d8cbd35f0b01ef2173ff6f9b640ca0bd4748efa398bf9a9c0acd6a66d9332fdc9b47ffe28ba7ab6090c26747b85f4fab22f936b71eb3f64613d8bd9dfabe9bb68da19de78321b481e5297df9e40ec8a3d662f3e1479c65de0"
- },
- {
- "input_len": 7,
- "hash": "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a941f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fef1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c",
- "keyed_hash": "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5fd6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6",
- "derive_key": "dc3b6485f9d94935329442916b0d059685ba815a1fa2a14107217453a7fc9f0e66266db2ea7c96843f9d8208e600a73f7f45b2f55b9e6d6a7ccf05daae63a3fdd10b25ac0bd2e224ce8291f88c05976d575df998477db86fb2cfbbf91725d62cb57acfeb3c2d973b89b503c2b60dde85a7802b69dc1ac2007d5623cbea8cbfb6b181f5"
- },
- {
- "input_len": 8,
- "hash": "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb725d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a22e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c",
- "keyed_hash": "be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305abf86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276",
- "derive_key": "2b166978cef14d9d438046c720519d8b1cad707e199746f1562d0c87fbd32940f0e2545a96693a66654225ebbaac76d093bfa9cd8f525a53acb92a861a98c42e7d1c4ae82e68ab691d510012edd2a728f98cd4794ef757e94d6546961b4f280a51aac339cc95b64a92b83cc3f26d8af8dfb4c091c240acdb4d47728d23e7148720ef04"
- },
- {
- "input_len": 63,
- "hash": "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b1197012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf4687093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755",
- "keyed_hash": "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea05a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847abb38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f11678377483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d",
- "derive_key": "b6451e30b953c206e34644c6803724e9d2725e0893039cfc49584f991f451af3b89e8ff572d3da4f4022199b9563b9d70ebb616efff0763e9abec71b550f1371e233319c4c4e74da936ba8e5bbb29a598e007a0bbfa929c99738ca2cc098d59134d11ff300c39f82e2fce9f7f0fa266459503f64ab9913befc65fddc474f6dc1c67669"
- },
- {
- "input_len": 64,
- "hash": "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7fbb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74",
- "keyed_hash": "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e68244c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f77a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c9255306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb",
- "derive_key": "a5c4a7053fa86b64746d4bb688d06ad1f02a18fce9afd3e818fefaa7126bf73e9b9493a9befebe0bf0c9509fb3105cfa0e262cde141aa8e3f2c2f77890bb64a4cca96922a21ead111f6338ad5244f2c15c44cb595443ac2ac294231e31be4a4307d0a91e874d36fc9852aeb1265c09b6e0cda7c37ef686fbbcab97e8ff66718be048bb"
- },
- {
- "input_len": 65,
- "hash": "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c",
- "keyed_hash": "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b9041497de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad",
- "derive_key": "51fd05c3c1cfbc8ed67d139ad76f5cf8236cd2acd26627a30c104dfd9d3ff8a82b02e8bd36d8498a75ad8c8e9b15eb386970283d6dd42c8ae7911cc592887fdbe26a0a5f0bf821cd92986c60b2502c9be3f98a9c133a7e8045ea867e0828c7252e739321f7c2d65daee4468eb4429efae469a42763f1f94977435d10dccae3e3dce88d"
- },
- {
- "input_len": 127,
- "hash": "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da47644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc44355b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78",
- "keyed_hash": "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd54663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc",
- "derive_key": "c91c090ceee3a3ac81902da31838012625bbcd73fcb92e7d7e56f78deba4f0c3feeb3974306966ccb3e3c69c337ef8a45660ad02526306fd685c88542ad00f759af6dd1adc2e50c2b8aac9f0c5221ff481565cf6455b772515a69463223202e5c371743e35210bbbbabd89651684107fd9fe493c937be16e39cfa7084a36207c99bea3"
- },
- {
- "input_len": 128,
- "hash": "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa69faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ecba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f75e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c",
- "keyed_hash": "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd86bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5",
- "derive_key": "81720f34452f58a0120a58b6b4608384b5c51d11f39ce97161a0c0e442ca022550e7cd651e312f0b4c6afb3c348ae5dd17d2b29fab3b894d9a0034c7b04fd9190cbd90043ff65d1657bbc05bfdecf2897dd894c7a1b54656d59a50b51190a9da44db426266ad6ce7c173a8c0bbe091b75e734b4dadb59b2861cd2518b4e7591e4b83c9"
- },
- {
- "input_len": 129,
- "hash": "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f96ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c7127bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7",
- "keyed_hash": "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aaee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412cd8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683",
- "derive_key": "938d2d4435be30eafdbb2b7031f7857c98b04881227391dc40db3c7b21f41fc18d72d0f9c1de5760e1941aebf3100b51d64644cb459eb5d20258e233892805eb98b07570ef2a1787cd48e117c8d6a63a68fd8fc8e59e79dbe63129e88352865721c8d5f0cf183f85e0609860472b0d6087cefdd186d984b21542c1c780684ed6832d8d"
- },
- {
- "input_len": 1023,
- "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485",
- "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10",
- "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d"
- },
- {
- "input_len": 1024,
- "hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e",
- "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de",
- "derive_key": "7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad"
- },
- {
- "input_len": 1025,
- "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a",
- "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930",
- "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad"
- },
- {
- "input_len": 2048,
- "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9",
- "keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe",
- "derive_key": "7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583"
- },
- {
- "input_len": 2049,
- "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3",
- "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e",
- "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6"
- },
- {
- "input_len": 3072,
- "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11",
- "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b",
- "derive_key": "050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0"
- },
- {
- "input_len": 3073,
- "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf",
- "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5",
- "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5"
- },
- {
- "input_len": 4096,
- "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620",
- "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de",
- "derive_key": "1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245"
- },
- {
- "input_len": 4097,
- "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956",
- "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f",
- "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad"
- },
- {
- "input_len": 5120,
- "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059",
- "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e",
- "derive_key": "7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d"
- },
- {
- "input_len": 5121,
- "hash": "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95",
- "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d",
- "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165"
- },
- {
- "input_len": 6144,
- "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83",
- "keyed_hash": "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e",
- "derive_key": "2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef"
- },
- {
- "input_len": 6145,
- "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022",
- "keyed_hash": "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c",
- "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2"
- },
- {
- "input_len": 7168,
- "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95",
- "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52",
- "derive_key": "11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88"
- },
- {
- "input_len": 7169,
- "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8",
- "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54",
- "derive_key": "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd"
- },
- {
- "input_len": 8192,
- "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf",
- "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102",
- "derive_key": "ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7"
- },
- {
- "input_len": 8193,
- "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6",
- "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57",
- "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0"
- },
- {
- "input_len": 16384,
- "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893",
- "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65",
- "derive_key": "160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57"
- },
- {
- "input_len": 31744,
- "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f",
- "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec",
- "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b"
- },
- {
- "input_len": 102400,
- "hash": "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e01c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e",
- "keyed_hash": "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4",
- "derive_key": "4652cff7a3f385a6103b5c260fc1593e13c778dbe608efb092fe7ee69df6e9c6d83a3e041bc3a48df2879f4a0a3ed40e7c961c73eff740f3117a0504c2dff4786d44fb17f1549eb0ba585e40ec29bf7732f0b7e286ff8acddc4cb1e23b87ff5d824a986458dcc6a04ac83969b80637562953df51ed1a7e90a7926924d2763778be8560"
- }
- ]
-}
diff --git a/thirdparty/BLAKE3/tools/compiler_version/Cargo.toml b/thirdparty/BLAKE3/tools/compiler_version/Cargo.toml
deleted file mode 100644
index 1046cf29d..000000000
--- a/thirdparty/BLAKE3/tools/compiler_version/Cargo.toml
+++ /dev/null
@@ -1,7 +0,0 @@
-[package]
-name = "compiler_version"
-version = "0.0.0"
-edition = "2018"
-
-[build-dependencies]
-cc = "1.0.50"
diff --git a/thirdparty/BLAKE3/tools/compiler_version/build.rs b/thirdparty/BLAKE3/tools/compiler_version/build.rs
deleted file mode 100644
index 3e14ebe67..000000000
--- a/thirdparty/BLAKE3/tools/compiler_version/build.rs
+++ /dev/null
@@ -1,6 +0,0 @@
-fn main() {
- let build = cc::Build::new();
- let compiler = build.get_compiler();
- let compiler_path = compiler.path().to_string_lossy();
- println!("cargo:rustc-env=COMPILER_PATH={}", compiler_path);
-}
diff --git a/thirdparty/BLAKE3/tools/compiler_version/src/main.rs b/thirdparty/BLAKE3/tools/compiler_version/src/main.rs
deleted file mode 100644
index 767cb31bd..000000000
--- a/thirdparty/BLAKE3/tools/compiler_version/src/main.rs
+++ /dev/null
@@ -1,27 +0,0 @@
-use std::process::Command;
-
-fn main() {
- // Print the rustc version.
- Command::new(env!("CARGO"))
- .args(&["rustc", "--quiet", "--", "--version"])
- .status()
- .unwrap();
- println!();
-
- // Print the Cargo version.
- Command::new(env!("CARGO"))
- .args(&["--version"])
- .status()
- .unwrap();
- println!();
-
- // Print the C compiler version. This relies on C compiler detection done
- // in build.rs, which sets the COMPILER_PATH variable.
- let compiler_path = env!("COMPILER_PATH");
- let mut compiler_command = Command::new(compiler_path);
- // Use the --version flag on everything other than MSVC.
- if !cfg!(target_env = "msvc") {
- compiler_command.arg("--version");
- }
- let _ = compiler_command.status().unwrap();
-}
diff --git a/thirdparty/BLAKE3/tools/instruction_set_support/Cargo.toml b/thirdparty/BLAKE3/tools/instruction_set_support/Cargo.toml
deleted file mode 100644
index 9e30174a9..000000000
--- a/thirdparty/BLAKE3/tools/instruction_set_support/Cargo.toml
+++ /dev/null
@@ -1,6 +0,0 @@
-[package]
-name = "instruction_set_support"
-version = "0.0.0"
-edition = "2018"
-
-[dependencies]
diff --git a/thirdparty/BLAKE3/tools/instruction_set_support/src/main.rs b/thirdparty/BLAKE3/tools/instruction_set_support/src/main.rs
deleted file mode 100644
index 6b509b053..000000000
--- a/thirdparty/BLAKE3/tools/instruction_set_support/src/main.rs
+++ /dev/null
@@ -1,10 +0,0 @@
-fn main() {
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
- {
- dbg!(is_x86_feature_detected!("sse2"));
- dbg!(is_x86_feature_detected!("sse4.1"));
- dbg!(is_x86_feature_detected!("avx2"));
- dbg!(is_x86_feature_detected!("avx512f"));
- dbg!(is_x86_feature_detected!("avx512vl"));
- }
-}
diff --git a/xmake.lua b/xmake.lua
index 4b13cde5c..4469a3acc 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -4,6 +4,7 @@ set_configvar("ZEN_SCHEMA_VERSION", 4) -- store Cid data in CAS under raw hash (
add_requires(
"vcpkg::asio",
+ "vcpkg::blake3",
"vcpkg::catch2 2.13.8",
"vcpkg::cpr",
"vcpkg::curl",
diff --git a/zencore/blake3.cpp b/zencore/blake3.cpp
index 02d6eb241..89826ae5d 100644
--- a/zencore/blake3.cpp
+++ b/zencore/blake3.cpp
@@ -7,13 +7,10 @@
#include <zencore/testing.h>
#include <zencore/zencore.h>
-#include "../thirdparty/BLAKE3/c/blake3.h"
-#if ZEN_PLATFORM_WINDOWS
-# pragma comment(lib, "blake3.lib")
-#endif
-
#include <string.h>
+#include "blake3.h"
+
//////////////////////////////////////////////////////////////////////////
namespace zen {
diff --git a/zencore/xmake.lua b/zencore/xmake.lua
index f01c12c86..aae20274b 100644
--- a/zencore/xmake.lua
+++ b/zencore/xmake.lua
@@ -14,40 +14,33 @@ target('zencore')
add_includedirs("include", {public=true})
add_includedirs("$(projectdir)/thirdparty/utfcpp/source")
add_includedirs("$(projectdir)/thirdparty/trace", {public=true})
+ add_links("blake3")
if is_os("windows") then
- add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Win64")
add_linkdirs("$(projectdir)/thirdparty/Oodle/lib/Win64")
elseif is_os("linux") then
- add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Linux_x64")
add_linkdirs("$(projectdir)/thirdparty/Oodle/lib/Linux_x64")
- add_links("blake3")
add_links("oo2corelinux64")
add_syslinks("pthread")
elseif is_os("macosx") then
- if is_arch("arm64") then
- add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Mac_arm64")
- else
- add_linkdirs("$(projectdir)/thirdparty/BLAKE3/lib/Mac_x64")
- end
add_linkdirs("$(projectdir)/thirdparty/Oodle/lib/Mac_x64")
- add_links("blake3")
add_links("oo2coremac64")
end
add_options("zentrace")
add_packages(
- "vcpkg::spdlog",
- "vcpkg::fmt",
- "vcpkg::doctest",
+ "vcpkg::blake3",
"vcpkg::catch2",
+ "vcpkg::cpr",
+ "vcpkg::curl", -- required by cpr
+ "vcpkg::doctest",
+ "vcpkg::fmt",
+ "vcpkg::gsl-lite",
"vcpkg::json11",
"vcpkg::lz4",
"vcpkg::mimalloc",
- "vcpkg::cpr",
- "vcpkg::curl", -- required by cpr
- "vcpkg::zlib", -- required by curl
"vcpkg::openssl", -- required by curl
- "vcpkg::xxhash",
- "vcpkg::gsl-lite")
+ "vcpkg::spdlog",
+ "vcpkg::zlib", -- required by curl
+ "vcpkg::xxhash")
if is_plat("linux") then
-- The 'vcpkg::openssl' package is two libraries; ssl and crypto, with