diff options
| author | Stefan Boberg <[email protected]> | 2025-11-07 14:49:13 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-11-07 14:49:13 +0100 |
| commit | 24e43a913f29ac3b314354e8ce5175f135bcc64f (patch) | |
| tree | ca442937ceeb63461012b33a4576e9835099f106 /thirdparty/blake3/src | |
| parent | get oplog attachments (#622) (diff) | |
| download | zen-24e43a913f29ac3b314354e8ce5175f135bcc64f.tar.xz zen-24e43a913f29ac3b314354e8ce5175f135bcc64f.zip | |
switch to xmake for package management (#611)
This change removes our dependency on vcpkg for package management, in favour of bringing some code in-tree in the `thirdparty` folder as well as using the xmake built-in package management feature. For the latter, all the package definitions are maintained in the zen repo itself, in the `repo` folder.
It should now also be easier to build the project as it will no longer depend on having the right version of vcpkg installed, which has been a common problem for new people coming into the codebase. Now you should only need xmake to build.
* Bumps xmake requirement on GitHub runners to 2.9.9 to resolve an issue where xmake on Windows invokes cmake with the `v144` toolchain, which does not exist
* BLAKE3 is now in-tree at `thirdparty/blake3`
* cpr is now in-tree at `thirdparty/cpr`
* cxxopts is now in-tree at `thirdparty/cxxopts`
* fmt is now in-tree at `thirdparty/fmt`
* robin-map is now in-tree at `thirdparty/robin-map`
* ryml is now in-tree at `thirdparty/ryml`
* sol2 is now in-tree at `thirdparty/sol2`
* spdlog is now in-tree at `thirdparty/spdlog`
* utfcpp is now in-tree at `thirdparty/utfcpp`
* xmake package repo definitions are in `repo`
* implemented support for sanitizers. ASAN is supported on windows, TSAN, UBSAN, MSAN etc are supported on Linux/MacOS though I have not yet tested it extensively on MacOS
* the zencore encryption implementation also now supports using mbedTLS which is used on MacOS, though for now we still use openssl on Linux
* crashpad
* bumps libcurl to 8.11.0 (from 8.8.0) which should address a rare build upload bug
Diffstat (limited to 'thirdparty/blake3/src')
| -rw-r--r-- | thirdparty/blake3/src/ffi_avx2.rs | 65 | ||||
| -rw-r--r-- | thirdparty/blake3/src/ffi_avx512.rs | 169 | ||||
| -rw-r--r-- | thirdparty/blake3/src/ffi_neon.rs | 82 | ||||
| -rw-r--r-- | thirdparty/blake3/src/ffi_sse2.rs | 126 | ||||
| -rw-r--r-- | thirdparty/blake3/src/ffi_sse41.rs | 126 | ||||
| -rw-r--r-- | thirdparty/blake3/src/guts.rs | 60 | ||||
| -rw-r--r-- | thirdparty/blake3/src/hazmat.rs | 704 | ||||
| -rw-r--r-- | thirdparty/blake3/src/io.rs | 64 | ||||
| -rw-r--r-- | thirdparty/blake3/src/join.rs | 92 | ||||
| -rw-r--r-- | thirdparty/blake3/src/lib.rs | 1835 | ||||
| -rw-r--r-- | thirdparty/blake3/src/platform.rs | 587 | ||||
| -rw-r--r-- | thirdparty/blake3/src/portable.rs | 198 | ||||
| -rw-r--r-- | thirdparty/blake3/src/rust_avx2.rs | 474 | ||||
| -rw-r--r-- | thirdparty/blake3/src/rust_sse2.rs | 775 | ||||
| -rw-r--r-- | thirdparty/blake3/src/rust_sse41.rs | 766 | ||||
| -rw-r--r-- | thirdparty/blake3/src/test.rs | 1049 | ||||
| -rw-r--r-- | thirdparty/blake3/src/traits.rs | 227 | ||||
| -rw-r--r-- | thirdparty/blake3/src/wasm32_simd.rs | 794 |
18 files changed, 8193 insertions, 0 deletions
diff --git a/thirdparty/blake3/src/ffi_avx2.rs b/thirdparty/blake3/src/ffi_avx2.rs new file mode 100644 index 000000000..43bf1504a --- /dev/null +++ b/thirdparty/blake3/src/ffi_avx2.rs @@ -0,0 +1,65 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Note that there is no AVX2 implementation of compress_in_place or +// compress_xof. + +// Unsafe because this may only be called on platforms supporting AVX2. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + unsafe { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) + } +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/thirdparty/blake3/src/ffi_avx512.rs b/thirdparty/blake3/src/ffi_avx512.rs new file mode 100644 index 000000000..e648edaf2 --- /dev/null +++ b/thirdparty/blake3/src/ffi_avx512.rs @@ -0,0 +1,169 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting AVX-512. 
+pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + unsafe { + ffi::blake3_compress_in_place_avx512( + cv.as_mut_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + ) + } +} + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + unsafe { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_avx512( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out + } +} + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + unsafe { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx512( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) + } +} + +// Unsafe because this may only be called on platforms supporting AVX-512. 
+#[cfg(unix)] +pub unsafe fn xof_many( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + out: &mut [u8], +) { + unsafe { + debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); + ffi::blake3_xof_many_avx512( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + out.len() / BLOCK_LEN, + ); + } +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_avx512( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + #[cfg(unix)] + pub fn blake3_xof_many_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + outblocks: usize, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } + + #[cfg(unix)] + #[test] + fn test_xof_many() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_xof_many_fn(xof_many); + } +} diff --git a/thirdparty/blake3/src/ffi_neon.rs b/thirdparty/blake3/src/ffi_neon.rs new file mode 100644 index 000000000..54d07a4de --- /dev/null +++ b/thirdparty/blake3/src/ffi_neon.rs @@ -0,0 +1,82 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting NEON. 
+pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_neon( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +// blake3_neon.c normally depends on blake3_portable.c, because the NEON +// implementation only provides 4x compression, and it relies on the portable +// implementation for 1x compression. However, we expose the portable Rust +// implementation here instead, to avoid linking in unnecessary code. +#[no_mangle] +pub extern "C" fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, +) { + unsafe { + crate::portable::compress_in_place( + &mut *(cv as *mut [u32; 8]), + &*(block as *const [u8; 64]), + block_len, + counter, + flags, + ) + } +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_neon( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + // This entire file is gated on feature="neon", so NEON support is + // assumed here. 
+ crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/thirdparty/blake3/src/ffi_sse2.rs b/thirdparty/blake3/src/ffi_sse2.rs new file mode 100644 index 000000000..8dafd6983 --- /dev/null +++ b/thirdparty/blake3/src/ffi_sse2.rs @@ -0,0 +1,126 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + unsafe { + ffi::blake3_compress_in_place_sse2( + cv.as_mut_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + ) + } +} + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + unsafe { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse2( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out + } +} + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + unsafe { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. 
+ assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) + } +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse2( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse2( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/thirdparty/blake3/src/ffi_sse41.rs b/thirdparty/blake3/src/ffi_sse41.rs new file mode 100644 index 000000000..f851ca153 --- /dev/null +++ b/thirdparty/blake3/src/ffi_sse41.rs @@ -0,0 +1,126 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + unsafe { + ffi::blake3_compress_in_place_sse41( + cv.as_mut_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + ) + } +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. 
+pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + unsafe { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse41( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out + } +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + unsafe { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse41( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) + } +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git 
a/thirdparty/blake3/src/guts.rs b/thirdparty/blake3/src/guts.rs new file mode 100644 index 000000000..6bbf5a571 --- /dev/null +++ b/thirdparty/blake3/src/guts.rs @@ -0,0 +1,60 @@ +//! Deprecated in favor of [`hazmat`](crate::hazmat) + +pub use crate::{BLOCK_LEN, CHUNK_LEN}; + +#[derive(Clone, Debug)] +pub struct ChunkState(crate::ChunkState); + +impl ChunkState { + // Currently this type only supports the regular hash mode. If an + // incremental user needs keyed_hash or derive_key, we can add that. + pub fn new(chunk_counter: u64) -> Self { + Self(crate::ChunkState::new( + crate::IV, + chunk_counter, + 0, + crate::platform::Platform::detect(), + )) + } + + #[inline] + pub fn len(&self) -> usize { + self.0.count() + } + + #[inline] + pub fn update(&mut self, input: &[u8]) -> &mut Self { + self.0.update(input); + self + } + + pub fn finalize(&self, is_root: bool) -> crate::Hash { + let output = self.0.output(); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } + } +} + +// As above, this currently assumes the regular hash mode. If an incremental +// user needs keyed_hash or derive_key, we can add that. +pub fn parent_cv( + left_child: &crate::Hash, + right_child: &crate::Hash, + is_root: bool, +) -> crate::Hash { + let output = crate::parent_node_output( + left_child.as_bytes(), + right_child.as_bytes(), + crate::IV, + 0, + crate::platform::Platform::detect(), + ); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } +} diff --git a/thirdparty/blake3/src/hazmat.rs b/thirdparty/blake3/src/hazmat.rs new file mode 100644 index 000000000..2fd2449db --- /dev/null +++ b/thirdparty/blake3/src/hazmat.rs @@ -0,0 +1,704 @@ +//! Low-level tree manipulations and other sharp tools +//! +//! The target audience for this module is projects like [Bao](https://github.com/oconnor663/bao), +//! which work directly with the interior hashes ("chaining values") of BLAKE3 chunks and subtrees. +//! 
For example, you could use these functions to implement a BitTorrent-like protocol using the +//! BLAKE3 tree structure, or to hash an input that's distributed across different machines. These +//! use cases are advanced, and most applications don't need this module. Also: +//! +//! <div class="warning"> +//! +//! **Warning:** This module is *hazardous material*. If you've heard folks say *don't roll your +//! own crypto,* this is the sort of thing they're talking about. These functions have complicated +//! requirements, and any mistakes will give you garbage output and/or break the security +//! properties that BLAKE3 is supposed to have. Read section 2.1 of [the BLAKE3 +//! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) to understand the +//! tree structure you need to maintain. Test your code against [`blake3::hash`](../fn.hash.html) +//! and make sure you can get the same outputs for [lots of different +//! inputs](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json). +//! +//! </div> +//! +//! On the other hand: +//! +//! <div class="warning"> +//! +//! **Encouragement:** Playing with these functions is a great way to learn how BLAKE3 works on the +//! inside. Have fun! +//! +//! </div> +//! +//! The main entrypoint for this module is the [`HasherExt`] trait, particularly the +//! [`set_input_offset`](HasherExt::set_input_offset) and +//! [`finalize_non_root`](HasherExt::finalize_non_root) methods. These let you compute the chaining +//! values of individual chunks or subtrees. You then combine these chaining values into larger +//! subtrees using [`merge_subtrees_non_root`] and finally (once at the very top) +//! [`merge_subtrees_root`] or [`merge_subtrees_root_xof`]. +//! +//! # Examples +//! +//! Here's an example of computing all the interior hashes in a 3-chunk tree: +//! +//! ```text +//! root +//! / \ +//! parent \ +//! / \ \ +//! chunk0 chunk1 chunk2 +//! ``` +//! +//! ``` +//! 
# fn main() { +//! use blake3::{Hasher, CHUNK_LEN}; +//! use blake3::hazmat::{merge_subtrees_non_root, merge_subtrees_root, Mode}; +//! use blake3::hazmat::HasherExt; // an extension trait for Hasher +//! +//! let chunk0 = [b'a'; CHUNK_LEN]; +//! let chunk1 = [b'b'; CHUNK_LEN]; +//! let chunk2 = [b'c'; 42]; // The final chunk can be short. +//! +//! // Compute the non-root hashes ("chaining values") of all three chunks. Chunks or subtrees +//! // that don't begin at the start of the input use `set_input_offset` to say where they begin. +//! let chunk0_cv = Hasher::new() +//! // .set_input_offset(0) is the default. +//! .update(&chunk0) +//! .finalize_non_root(); +//! let chunk1_cv = Hasher::new() +//! .set_input_offset(CHUNK_LEN as u64) +//! .update(&chunk1) +//! .finalize_non_root(); +//! let chunk2_cv = Hasher::new() +//! .set_input_offset(2 * CHUNK_LEN as u64) +//! .update(&chunk2) +//! .finalize_non_root(); +//! +//! // Join the first two chunks with a non-root parent node and compute its chaining value. +//! let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); +//! +//! // Join that parent node and the third chunk with a root parent node and compute the hash. +//! let root_hash = merge_subtrees_root(&parent_cv, &chunk2_cv, Mode::Hash); +//! +//! // Double check that we got the right answer. +//! let mut combined_input = Vec::new(); +//! combined_input.extend_from_slice(&chunk0); +//! combined_input.extend_from_slice(&chunk1); +//! combined_input.extend_from_slice(&chunk2); +//! assert_eq!(root_hash, blake3::hash(&combined_input)); +//! # } +//! ``` +//! +//! Hashing many chunks together is important for performance, because it allows the implementation +//! to use SIMD parallelism internally. ([AVX-512](https://en.wikipedia.org/wiki/AVX-512) for +//! example needs 16 chunks to really get going.) We can reproduce `parent_cv` by hashing `chunk0` +//! and `chunk1` at the same time: +//! +//! ``` +//! # fn main() { +//! 
# use blake3::{Hasher, CHUNK_LEN}; +//! # use blake3::hazmat::{Mode, HasherExt, merge_subtrees_non_root, merge_subtrees_root}; +//! # let chunk0 = [b'a'; CHUNK_LEN]; +//! # let chunk1 = [b'b'; CHUNK_LEN]; +//! # let chunk0_cv = Hasher::new().update(&chunk0).finalize_non_root(); +//! # let chunk1_cv = Hasher::new().set_input_offset(CHUNK_LEN as u64).update(&chunk1).finalize_non_root(); +//! # let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); +//! # let mut combined_input = Vec::new(); +//! # combined_input.extend_from_slice(&chunk0); +//! # combined_input.extend_from_slice(&chunk1); +//! let left_subtree_cv = Hasher::new() +//! // .set_input_offset(0) is the default. +//! .update(&combined_input[..2 * CHUNK_LEN]) +//! .finalize_non_root(); +//! assert_eq!(left_subtree_cv, parent_cv); +//! +//! // Using multiple updates gives the same answer, though it's not as efficient. +//! let mut subtree_hasher = Hasher::new(); +//! // Again, .set_input_offset(0) is the default. +//! subtree_hasher.update(&chunk0); +//! subtree_hasher.update(&chunk1); +//! assert_eq!(left_subtree_cv, subtree_hasher.finalize_non_root()); +//! # } +//! ``` +//! +//! However, hashing multiple chunks together **must** respect the overall tree structure. Hashing +//! `chunk0` and `chunk1` together is valid, but hashing `chunk1` and `chunk2` together is +//! incorrect and gives a garbage result that will never match a standard BLAKE3 hash. The +//! implementation includes a few best-effort asserts to catch some of these mistakes, but these +//! checks aren't guaranteed. For example, this second call to `update` currently panics: +//! +//! ```should_panic +//! # fn main() { +//! # use blake3::{Hasher, CHUNK_LEN}; +//! # use blake3::hazmat::HasherExt; +//! # let chunk0 = [b'a'; CHUNK_LEN]; +//! # let chunk1 = [b'b'; CHUNK_LEN]; +//! # let chunk2 = [b'c'; 42]; +//! let oops = Hasher::new() +//! .set_input_offset(CHUNK_LEN as u64) +//! .update(&chunk1) +//! 
// PANIC: "the subtree starting at 1024 contains at most 1024 bytes" +//! .update(&chunk2) +//! .finalize_non_root(); +//! # } +//! ``` +//! +//! For more on valid tree structures, see the docs for and [`left_subtree_len`] and +//! [`max_subtree_len`], and see section 2.1 of [the BLAKE3 +//! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that the +//! merging functions ([`merge_subtrees_root`] and friends) don't know the shape of the left and +//! right subtrees you're giving them, and they can't help you catch mistakes. The best way to +//! catch mistakes with these is to compare your root output to the [`blake3::hash`](crate::hash) +//! of the same input. + +use crate::platform::Platform; +use crate::{CVWords, Hasher, CHUNK_LEN, IV, KEY_LEN, OUT_LEN}; + +/// Extension methods for [`Hasher`]. This is the main entrypoint to the `hazmat` module. +pub trait HasherExt { + /// Similar to [`Hasher::new_derive_key`] but using a pre-hashed [`ContextKey`] from + /// [`hash_derive_key_context`]. + /// + /// The [`hash_derive_key_context`] function is _only_ valid source of the [`ContextKey`] + /// + /// # Example + /// + /// ``` + /// use blake3::Hasher; + /// use blake3::hazmat::HasherExt; + /// + /// let context_key = blake3::hazmat::hash_derive_key_context("foo"); + /// let mut hasher = Hasher::new_from_context_key(&context_key); + /// hasher.update(b"bar"); + /// let derived_key = *hasher.finalize().as_bytes(); + /// + /// assert_eq!(derived_key, blake3::derive_key("foo", b"bar")); + /// ``` + fn new_from_context_key(context_key: &ContextKey) -> Self; + + /// Configure the `Hasher` to process a chunk or subtree starting at `offset` bytes into the + /// whole input. + /// + /// You must call this function before processing any input with [`update`](Hasher::update) or + /// similar. This step isn't required for the first chunk, or for a subtree that includes the + /// first chunk (i.e. 
when the `offset` is zero), but it's required for all other chunks and + /// subtrees. + /// + /// The starting input offset of a subtree implies a maximum possible length for that subtree. + /// See [`max_subtree_len`] and section 2.1 of [the BLAKE3 + /// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that only + /// subtrees along the right edge of the whole tree can have a length less than their maximum + /// possible length. + /// + /// See the [module level examples](index.html#examples). + /// + /// # Panics + /// + /// This function panics if the `Hasher` has already accepted any input with + /// [`update`](Hasher::update) or similar. + /// + /// This should always be paired with [`finalize_non_root`](HasherExt::finalize_non_root). It's + /// never correct to use a non-zero input offset with [`finalize`](Hasher::finalize) or + /// [`finalize_xof`](Hasher::finalize_xof). The `offset` must also be a multiple of + /// `CHUNK_LEN`. Violating either of these rules will currently fail an assertion and panic, + /// but this is not guaranteed. + fn set_input_offset(&mut self, offset: u64) -> &mut Self; + + /// Finalize the non-root hash ("chaining value") of the current chunk or subtree. + /// + /// Afterwards you can merge subtree chaining values into parent nodes using + /// [`merge_subtrees_non_root`] and ultimately into the root node with either + /// [`merge_subtrees_root`] (similar to [`Hasher::finalize`]) or [`merge_subtrees_root_xof`] + /// (similar to [`Hasher::finalize_xof`]). + /// + /// See the [module level examples](index.html#examples), particularly the discussion of valid + /// tree structures. 
+ fn finalize_non_root(&self) -> ChainingValue; +} + +impl HasherExt for Hasher { + fn new_from_context_key(context_key: &[u8; KEY_LEN]) -> Hasher { + let context_key_words = crate::platform::words_from_le_bytes_32(context_key); + Hasher::new_internal(&context_key_words, crate::DERIVE_KEY_MATERIAL) + } + + fn set_input_offset(&mut self, offset: u64) -> &mut Hasher { + assert_eq!(self.count(), 0, "hasher has already accepted input"); + assert_eq!( + offset % CHUNK_LEN as u64, + 0, + "offset ({offset}) must be a chunk boundary (divisible by {CHUNK_LEN})", + ); + let counter = offset / CHUNK_LEN as u64; + self.chunk_state.chunk_counter = counter; + self.initial_chunk_counter = counter; + self + } + + fn finalize_non_root(&self) -> ChainingValue { + assert_ne!(self.count(), 0, "empty subtrees are never valid"); + self.final_output().chaining_value() + } +} + +/// The maximum length of a subtree in bytes, given its starting offset in bytes +/// +/// If you try to hash more than this many bytes as one subtree, you'll end up merging parent nodes +/// that shouldn't be merged, and your output will be garbage. [`Hasher::update`] will currently +/// panic in this case, but this is not guaranteed. +/// +/// For input offset zero (the default), there is no maximum length, and this function returns +/// `None`. For all other offsets it returns `Some`. Note that valid offsets must be a multiple of +/// [`CHUNK_LEN`] (1024); it's not possible to start hashing a chunk in the middle. +/// +/// In the example tree below, chunks are numbered by their _0-based index_. The subtree that +/// _starts_ with chunk 3, i.e. `input_offset = 3 * CHUNK_LEN`, includes only that one chunk, so +/// its max length is `Some(CHUNK_LEN)`. The subtree that starts with chunk 6 includes chunk 7 but +/// not chunk 8, so its max length is `Some(2 * CHUNK_LEN)`. 
The subtree that starts with chunk 12 +/// includes chunks 13, 14, and 15, but if the tree were bigger it would not include chunk 16, so +/// its max length is `Some(4 * CHUNK_LEN)`. One way to think about the rule here is that, if you +/// go beyond the max subtree length from a given starting offset, you start dealing with subtrees +/// that include chunks _to the left_ of where you started. +/// +/// ```text +/// root +/// / \ +/// . . +/// / \ / \ +/// . . . . +/// / \ / \ / \ / \ +/// . . . . . . . . +/// / \ / \ / \ / \ / \ / \ / \ / \ +/// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +/// ``` +/// +/// The general rule turns out to be that for a subtree starting at a 0-based chunk index N greater +/// than zero, the maximum number of chunks in that subtree is the largest power-of-two that +/// divides N, which is given by `1 << N.trailing_zeros()`. +/// +/// This function can be useful for writing tests or debug assertions, but it's actually rare to +/// use this for real control flow. Callers who split their input recursively using +/// [`left_subtree_len`] will automatically satisfy the `max_subtree_len` bound and don't +/// necessarily need to check. It's also common to choose some fixed power-of-two subtree size, say +/// 64 chunks, and divide your input up into slices of that fixed length (with the final slice +/// possibly short). This approach also automatically satisfies the `max_subtree_len` bound and +/// doesn't need to check. Proving that this is true can be an interesting exercise. Note that +/// chunks 0, 4, 8, and 12 all begin subtrees of at least 4 chunks in the example tree above. +/// +/// # Panics +/// +/// This function currently panics if `input_offset` is not a multiple of `CHUNK_LEN`. This is not +/// guaranteed. 
#[inline(always)]
pub fn max_subtree_len(input_offset: u64) -> Option<u64> {
    if input_offset == 0 {
        return None;
    }
    assert_eq!(input_offset % CHUNK_LEN as u64, 0);
    // `counter` is the 0-based index of the subtree's first chunk.
    let counter = input_offset / CHUNK_LEN as u64;
    // The largest power of two that divides `counter` (see the doc comment above).
    let max_chunks = 1 << counter.trailing_zeros();
    Some(max_chunks * CHUNK_LEN as u64)
}

#[test]
fn test_max_subtree_len() {
    assert_eq!(max_subtree_len(0), None);
    // (chunk index, max chunks)
    let cases = [
        (1, 1),
        (2, 2),
        (3, 1),
        (4, 4),
        (5, 1),
        (6, 2),
        (7, 1),
        (8, 8),
    ];
    for (chunk_index, max_chunks) in cases {
        let input_offset = chunk_index * CHUNK_LEN as u64;
        assert_eq!(
            max_subtree_len(input_offset),
            Some(max_chunks * CHUNK_LEN as u64),
        );
    }
}

/// Given the length in bytes of either a complete input or a subtree input, return the number of
/// bytes that belong to its left child subtree. The rest belong to its right child subtree.
///
/// Concretely, this function returns the largest power-of-two number of bytes that's strictly less
/// than `input_len`. This leads to a tree where all left subtrees are "complete" and at least as
/// large as their sibling right subtrees, as specified in section 2.1 of [the BLAKE3
/// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). For example, if an
/// input is exactly two chunks, its left and right subtrees both get one chunk. But if an input is
/// two chunks plus one more byte, then its left subtree gets two chunks, and its right subtree
/// only gets one byte.
///
/// This function isn't meaningful for one chunk of input, because chunks don't have children. It
/// currently panics in debug mode if `input_len <= CHUNK_LEN`.
///
/// # Example
///
/// Hash an input of random length as two subtrees:
///
/// ```
/// # #[cfg(feature = "std")] {
/// use blake3::hazmat::{left_subtree_len, merge_subtrees_root, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// // Generate a random-length input. Note that to be split into two subtrees, the input length
/// // must be greater than CHUNK_LEN.
/// let input_len = rand::random_range(CHUNK_LEN + 1..1_000_000);
/// let mut input = vec![0; input_len];
/// rand::fill(&mut input[..]);
///
/// // Compute the left and right subtree hashes and then the root hash. left_subtree_len() tells
/// // us exactly where to split the input. Any other split would either panic (if we're lucky) or
/// // lead to an incorrect root hash.
/// let left_len = left_subtree_len(input_len as u64) as usize;
/// let left_subtree_cv = Hasher::new()
///     .update(&input[..left_len])
///     .finalize_non_root();
/// let right_subtree_cv = Hasher::new()
///     .set_input_offset(left_len as u64)
///     .update(&input[left_len..])
///     .finalize_non_root();
/// let root_hash = merge_subtrees_root(&left_subtree_cv, &right_subtree_cv, Mode::Hash);
///
/// // Double check the answer.
/// assert_eq!(root_hash, blake3::hash(&input));
/// # }
/// ```
#[inline(always)]
pub fn left_subtree_len(input_len: u64) -> u64 {
    debug_assert!(input_len > CHUNK_LEN as u64);
    // Note that .next_power_of_two() is greater than *or equal*. Adding 1 and halving first makes
    // the result strictly less than `input_len`.
    ((input_len + 1) / 2).next_power_of_two()
}

#[test]
fn test_left_subtree_len() {
    assert_eq!(left_subtree_len(1025), 1024);
    for boundary_case in [2, 4, 8, 16, 32, 64] {
        let input_len = boundary_case * CHUNK_LEN as u64;
        assert_eq!(left_subtree_len(input_len - 1), input_len / 2);
        assert_eq!(left_subtree_len(input_len), input_len / 2);
        assert_eq!(left_subtree_len(input_len + 1), input_len);
    }
}

/// The `mode` argument to [`merge_subtrees_root`] and friends
///
/// See the [module level examples](index.html#examples).
#[derive(Copy, Clone, Debug)]
pub enum Mode<'a> {
    /// Corresponding to [`hash`](crate::hash)
    Hash,

    /// Corresponding to [`keyed_hash`](crate::keyed_hash)
    KeyedHash(&'a [u8; KEY_LEN]),

    /// Corresponding to [`derive_key`](crate::derive_key)
    ///
    /// The [`ContextKey`] comes from [`hash_derive_key_context`].
    DeriveKeyMaterial(&'a ContextKey),
}

impl<'a> Mode<'a> {
    // The key words for the parent-node compression: the standard IV for plain hashing, or the
    // caller's key / context key converted from little-endian bytes for the other two modes.
    fn key_words(&self) -> CVWords {
        match self {
            Mode::Hash => *IV,
            Mode::KeyedHash(key) => crate::platform::words_from_le_bytes_32(key),
            Mode::DeriveKeyMaterial(cx_key) => crate::platform::words_from_le_bytes_32(cx_key),
        }
    }

    // The domain-separation flag bits that correspond to this mode.
    fn flags_byte(&self) -> u8 {
        match self {
            Mode::Hash => 0,
            Mode::KeyedHash(_) => crate::KEYED_HASH,
            Mode::DeriveKeyMaterial(_) => crate::DERIVE_KEY_MATERIAL,
        }
    }
}

/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Besides just sounding fancy, it turns out there are [security
/// reasons](https://jacko.io/tree_hashing.html) to be careful about the difference between
/// (root/final) hashes and (non-root/non-final) chaining values.
pub type ChainingValue = [u8; OUT_LEN];

// Shared helper for the three public merge functions: compress two child chaining values into a
// parent node `Output`, which the caller then finalizes as a non-root CV, a root hash, or a root
// `OutputReader`.
fn merge_subtrees_inner(
    left_child: &ChainingValue,
    right_child: &ChainingValue,
    mode: Mode,
) -> crate::Output {
    crate::parent_node_output(
        &left_child,
        &right_child,
        &mode.key_words(),
        mode.flags_byte(),
        Platform::detect(),
    )
}

/// Compute a non-root parent node chaining value from two child chaining values.
///
/// See the [module level examples](index.html#examples), particularly the discussion of valid tree
/// structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or other calls to
/// `merge_subtrees_non_root`. "Chaining value" is the academic term for a non-root or non-final
/// hash.
pub fn merge_subtrees_non_root(
    left_child: &ChainingValue,
    right_child: &ChainingValue,
    mode: Mode,
) -> ChainingValue {
    // Finalize the shared parent-node state as a (non-root) chaining value.
    merge_subtrees_inner(left_child, right_child, mode).chaining_value()
}

/// Compute a root hash from two child chaining values.
///
/// See the [module level examples](index.html#examples), particularly the discussion of valid tree
/// structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`].
/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed
/// using this function. In that case you must get the root hash from [`Hasher::finalize`] (or just
/// [`blake3::hash`](crate::hash)).
pub fn merge_subtrees_root(
    left_child: &ChainingValue,
    right_child: &ChainingValue,
    mode: Mode,
) -> crate::Hash {
    // Finalize the shared parent-node state as the root hash.
    merge_subtrees_inner(left_child, right_child, mode).root_hash()
}

/// Build a root [`OutputReader`](crate::OutputReader) from two child chaining values.
///
/// See also the [module level examples](index.html#examples), particularly the discussion of valid
/// tree structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`].
/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed
/// using this function. In that case you must get the `OutputReader` from
/// [`Hasher::finalize_xof`].
///
/// # Example
///
/// ```
/// use blake3::hazmat::{merge_subtrees_root_xof, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// // Hash a 2-chunk subtree in steps. Note that only
/// // the final chunk can be shorter than CHUNK_LEN.
/// let chunk0 = &[42; CHUNK_LEN];
/// let chunk1 = b"hello world";
/// let chunk0_cv = Hasher::new()
///     .update(chunk0)
///     .finalize_non_root();
/// let chunk1_cv = Hasher::new()
///     .set_input_offset(CHUNK_LEN as u64)
///     .update(chunk1)
///     .finalize_non_root();
///
/// // Obtain a blake3::OutputReader at the root and extract 1000 bytes.
/// let mut output_reader = merge_subtrees_root_xof(&chunk0_cv, &chunk1_cv, Mode::Hash);
/// let mut output_bytes = [0; 1_000];
/// output_reader.fill(&mut output_bytes);
///
/// // Double check the answer.
/// let mut hasher = Hasher::new();
/// hasher.update(chunk0);
/// hasher.update(chunk1);
/// let mut expected = [0; 1_000];
/// hasher.finalize_xof().fill(&mut expected);
/// assert_eq!(output_bytes, expected);
/// ```
pub fn merge_subtrees_root_xof(
    left_child: &ChainingValue,
    right_child: &ChainingValue,
    mode: Mode,
) -> crate::OutputReader {
    // Wrap the root parent-node state in an extendable output reader.
    crate::OutputReader::new(merge_subtrees_inner(left_child, right_child, mode))
}

/// An alias to distinguish [`hash_derive_key_context`] outputs from other keys.
pub type ContextKey = [u8; KEY_LEN];

/// Hash a [`derive_key`](crate::derive_key) context string and return a [`ContextKey`].
///
/// The _only_ valid uses for the returned [`ContextKey`] are [`Hasher::new_from_context_key`] and
/// [`Mode::DeriveKeyMaterial`] (together with the merge subtree functions).
///
/// # Example
///
/// ```
/// use blake3::Hasher;
/// use blake3::hazmat::HasherExt;
///
/// let context_key = blake3::hazmat::hash_derive_key_context("foo");
/// let mut hasher = Hasher::new_from_context_key(&context_key);
/// hasher.update(b"bar");
/// let derived_key = *hasher.finalize().as_bytes();
///
/// assert_eq!(derived_key, blake3::derive_key("foo", b"bar"));
/// ```
pub fn hash_derive_key_context(context: &str) -> ContextKey {
    // Context hashing always uses the standard IV as the key and sets the DERIVE_KEY_CONTEXT
    // flag. The resulting root hash bytes become the context key.
    crate::hash_all_at_once::<crate::join::SerialJoin>(
        context.as_bytes(),
        IV,
        crate::DERIVE_KEY_CONTEXT,
    )
    .root_hash()
    .0
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    #[should_panic]
    fn test_empty_subtree_should_panic() {
        Hasher::new().finalize_non_root();
    }

    #[test]
    #[should_panic]
    fn test_unaligned_offset_should_panic() {
        Hasher::new().set_input_offset(1);
    }

    #[test]
    #[should_panic]
    fn test_hasher_already_accepted_input_should_panic() {
        Hasher::new().update(b"x").set_input_offset(0);
    }

    #[test]
    #[should_panic]
    fn test_too_much_input_should_panic() {
        Hasher::new()
            .set_input_offset(CHUNK_LEN as u64)
            .update(&[0; CHUNK_LEN + 1]);
    }

    #[test]
    #[should_panic]
    fn test_set_input_offset_cant_finalize() {
        Hasher::new().set_input_offset(CHUNK_LEN as u64).finalize();
    }

    #[test]
    #[should_panic]
    fn test_set_input_offset_cant_finalize_xof() {
        Hasher::new()
            .set_input_offset(CHUNK_LEN as u64)
            .finalize_xof();
    }

    // Hash the standard test inputs in fixed-size groups of chunks, using the hazmat offset and
    // merge APIs, and check the result against the all-at-once hash.
    #[test]
    fn test_grouped_hash() {
        const MAX_CHUNKS: usize = (crate::test::TEST_CASES_MAX + 1) / CHUNK_LEN;
        let mut input_buf = [0; crate::test::TEST_CASES_MAX];
        crate::test::paint_test_input(&mut input_buf);
        for subtree_chunks in [1, 2, 4, 8, 16, 32] {
            #[cfg(feature = "std")]
            dbg!(subtree_chunks);
            let subtree_len = subtree_chunks * CHUNK_LEN;
            for &case in crate::test::TEST_CASES {
                if case <= subtree_len {
                    continue;
                }
                #[cfg(feature = "std")]
                dbg!(case);
                let input = &input_buf[..case];
                let expected_hash = crate::hash(input);

                // Collect all the group chaining values.
                let mut chaining_values = arrayvec::ArrayVec::<ChainingValue, MAX_CHUNKS>::new();
                let mut subtree_offset = 0;
                while subtree_offset < input.len() {
                    let take = core::cmp::min(subtree_len, input.len() - subtree_offset);
                    let subtree_input = &input[subtree_offset..][..take];
                    let subtree_cv = Hasher::new()
                        .set_input_offset(subtree_offset as u64)
                        .update(subtree_input)
                        .finalize_non_root();
                    chaining_values.push(subtree_cv);
                    subtree_offset += take;
                }

                // Compress all the chaining_values together, layer by layer.
                assert!(chaining_values.len() >= 2);
                while chaining_values.len() > 2 {
                    let n = chaining_values.len();
                    // Merge each side-by-side pair in place, overwriting the front half of the
                    // array with the merged results. This moves us "up one level" in the tree.
                    for i in 0..(n / 2) {
                        chaining_values[i] = merge_subtrees_non_root(
                            &chaining_values[2 * i],
                            &chaining_values[2 * i + 1],
                            Mode::Hash,
                        );
                    }
                    // If there's an odd CV out, it moves up.
                    if n % 2 == 1 {
                        chaining_values[n / 2] = chaining_values[n - 1];
                    }
                    chaining_values.truncate(n / 2 + n % 2);
                }
                assert_eq!(chaining_values.len(), 2);
                let root_hash =
                    merge_subtrees_root(&chaining_values[0], &chaining_values[1], Mode::Hash);
                assert_eq!(expected_hash, root_hash);
            }
        }
    }

    #[test]
    fn test_keyed_hash_xof() {
        let group0 = &[42; 4096];
        let group1 = &[43; 4095];
        let mut input = [0; 8191];
        input[..4096].copy_from_slice(group0);
        input[4096..].copy_from_slice(group1);
        let key = &[44; 32];

        let mut expected_output = [0; 100];
        Hasher::new_keyed(&key)
            .update(&input)
            .finalize_xof()
            .fill(&mut expected_output);

        let mut hazmat_output = [0; 100];
        let left = Hasher::new_keyed(key).update(group0).finalize_non_root();
        let right = Hasher::new_keyed(key)
            .set_input_offset(group0.len() as u64)
            .update(group1)
            .finalize_non_root();
        merge_subtrees_root_xof(&left, &right, Mode::KeyedHash(&key)).fill(&mut hazmat_output);
        assert_eq!(expected_output, hazmat_output);
    }

    #[test]
    fn test_derive_key() {
        let context = "foo";
        let mut input = [0; 1025];
        crate::test::paint_test_input(&mut input);
        let expected = crate::derive_key(context, &input);

        let cx_key = hash_derive_key_context(context);
        let left = Hasher::new_from_context_key(&cx_key)
            .update(&input[..1024])
            .finalize_non_root();
        let right = Hasher::new_from_context_key(&cx_key)
            .set_input_offset(1024)
            .update(&input[1024..])
            .finalize_non_root();
        let derived_key = merge_subtrees_root(&left, &right, Mode::DeriveKeyMaterial(&cx_key)).0;
        assert_eq!(expected, derived_key);
    }
}
diff --git a/thirdparty/blake3/src/io.rs b/thirdparty/blake3/src/io.rs
new file mode 100644
index 000000000..7e8e154f7
--- /dev/null
+++ b/thirdparty/blake3/src/io.rs
@@ -0,0 +1,64 @@
//! Helper functions for efficient IO.
#[cfg(feature = "std")]
pub(crate) fn copy_wide(
    mut reader: impl std::io::Read,
    hasher: &mut crate::Hasher,
) -> std::io::Result<u64> {
    let mut buffer = [0; 65536];
    let mut total = 0;
    loop {
        match reader.read(&mut buffer) {
            // A read of zero bytes means EOF.
            Ok(0) => return Ok(total),
            Ok(n) => {
                hasher.update(&buffer[..n]);
                total += n as u64;
            }
            // see test_update_reader_interrupted
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}

// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
// fails, return the error.
//
// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get
// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because
// this function is crate-private, we can guarantee that all that can ever happen in the event of a
// race condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of
// which should risk memory corruption in a safe caller.
//
// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no
// platform in the "real world" is ever going to do anything other than compute the "wrong answer"
// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this?
// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a
// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have
// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do
// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to
// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert
// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give
// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's
// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to
// construct a safe &i32 to the register if you're going to leak that reference to unknown callers.
// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally
// different here. Feedback needed.
#[cfg(feature = "mmap")]
pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result<Option<memmap2::Mmap>> {
    let metadata = file.metadata()?;
    let file_size = metadata.len();
    if !metadata.is_file() {
        // Not a real file.
        Ok(None)
    } else if file_size < 16 * 1024 {
        // Mapping small files is not worth it, and some special files that can't be mapped report
        // a size of zero.
        Ok(None)
    } else {
        let map = unsafe { memmap2::Mmap::map(file)? };
        Ok(Some(map))
    }
}
diff --git a/thirdparty/blake3/src/join.rs b/thirdparty/blake3/src/join.rs
new file mode 100644
index 000000000..862ebcf9a
--- /dev/null
+++ b/thirdparty/blake3/src/join.rs
@@ -0,0 +1,92 @@
//! The multi-threading abstractions used by `Hasher::update_with_join`.
//!
//! Different implementations of the `Join` trait determine whether
//! `Hasher::update_with_join` performs multi-threading on sufficiently large
//! inputs. The `SerialJoin` implementation is single-threaded, and the
//! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded.
//! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash)
//! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin`
//! internally.
//!
//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and
//! `RayonJoin` is the only non-trivial implementation. Previously this trait
//! was public, but currently it's been re-privatized, as it's both 1) of no
//! value to most callers and 2) a pretty big implementation detail to commit
//! to.
//!
//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html

/// The trait that abstracts over single-threaded and multi-threaded recursion.
///
/// See the [`join` module docs](index.html) for more details.
pub trait Join {
    fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
    where
        A: FnOnce() -> RA + Send,
        B: FnOnce() -> RB + Send,
        RA: Send,
        RB: Send;
}

/// The trivial, serial implementation of `Join`. The left and right sides are
/// executed one after the other, on the calling thread. The standalone hashing
/// functions and the `Hasher::update` method use this implementation
/// internally.
///
/// See the [`join` module docs](index.html) for more details.
pub enum SerialJoin {}

impl Join for SerialJoin {
    #[inline]
    fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
    where
        A: FnOnce() -> RA + Send,
        B: FnOnce() -> RB + Send,
        RA: Send,
        RB: Send,
    {
        (oper_a(), oper_b())
    }
}

/// The Rayon-based implementation of `Join`. The left and right sides are
/// executed on the Rayon thread pool, potentially in parallel. This
/// implementation is gated by the `rayon` feature, which is off by default.
///
/// See the [`join` module docs](index.html) for more details.
#[cfg(feature = "rayon")]
pub enum RayonJoin {}

#[cfg(feature = "rayon")]
impl Join for RayonJoin {
    #[inline]
    fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
    where
        A: FnOnce() -> RA + Send,
        B: FnOnce() -> RB + Send,
        RA: Send,
        RB: Send,
    {
        rayon_core::join(oper_a, oper_b)
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_serial_join() {
        let oper_a = || 1 + 1;
        let oper_b = || 2 + 2;
        assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b));
    }

    #[test]
    #[cfg(feature = "rayon")]
    fn test_rayon_join() {
        let oper_a = || 1 + 1;
        let oper_b = || 2 + 2;
        assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b));
    }
}
diff --git a/thirdparty/blake3/src/lib.rs b/thirdparty/blake3/src/lib.rs
new file mode 100644
index 000000000..d777896f4
--- /dev/null
+++ b/thirdparty/blake3/src/lib.rs
@@ -0,0 +1,1835 @@
//! The official Rust implementation of the [BLAKE3] cryptographic hash
//! function.
//!
//! # Examples
//!
//! ```
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Hash an input all at once.
//! let hash1 = blake3::hash(b"foobarbaz");
//!
//! // Hash an input incrementally.
//! let mut hasher = blake3::Hasher::new();
//! hasher.update(b"foo");
//! hasher.update(b"bar");
//! hasher.update(b"baz");
//! let hash2 = hasher.finalize();
//! assert_eq!(hash1, hash2);
//!
//! // Extended output. OutputReader also implements Read and Seek.
//! # #[cfg(feature = "std")] {
//! let mut output = [0; 1000];
//! let mut output_reader = hasher.finalize_xof();
//! output_reader.fill(&mut output);
//! assert_eq!(hash1, output[..32]);
//! # }
//!
//! // Print a hash as hex.
//! println!("{}", hash1);
//! # Ok(())
//! # }
//! ```
//!
//! # Cargo Features
//!
//! The `std` feature (the only feature enabled by default) is required for
//! implementations of the [`Write`] and [`Seek`] traits, the
//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU
//! feature detection on x86. If this feature is disabled, the only way to use
//! the x86 SIMD implementations is to enable the corresponding instruction sets
//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary
//! will not be portable to other machines.
//!
//! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds
//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap`
//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for
//! multithreaded hashing. However, even if this feature is enabled, all other
//! APIs remain single-threaded.
//!
//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the
//! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above)
//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for
//! memory-mapped IO.
//!
//! The `zeroize` feature (disabled by default, but enabled for [docs.rs])
//! implements
//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for
//! this crate's types.
//!
//! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements
//! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and
//! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html)
//! for [`Hash`](struct@Hash).
//!
//! The NEON implementation is enabled by default for AArch64 but requires the
//! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and
//! enabling this feature will produce a binary that's not portable to CPUs
//! without NEON support.
//!
//! The `wasm32_simd` feature enables the WASM SIMD implementation for all `wasm32-`
//! targets. Similar to the `neon` feature, if `wasm32_simd` is enabled, WASM SIMD
//! support is assumed. This may become the default in the future.
//!
//! The `traits-preview` feature enables implementations of traits from the
//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
//! However, the traits aren't stable, and they're expected to change in
//! incompatible ways before that crate reaches 1.0. For that reason, this crate
//! makes no SemVer guarantees for this feature, and callers who use it should
//! expect breaking changes between patch versions. (The "-preview" feature name
//! follows the conventions of the RustCrypto [`signature`] crate.)
//!
//! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon
//! [BLAKE3]: https://blake3.io
//! [Rayon]: https://github.com/rayon-rs/rayon
//! [docs.rs]: https://docs.rs/
//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html
//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html
//! [`digest`]: https://crates.io/crates/digest
//! [`signature`]: https://crates.io/crates/signature

#![cfg_attr(not(feature = "std"), no_std)]

#[cfg(test)]
mod test;

#[doc(hidden)]
#[deprecated(since = "1.8.0", note = "use the hazmat module instead")]
pub mod guts;

pub mod hazmat;

/// Undocumented and unstable, for benchmarks only.
#[doc(hidden)]
pub mod platform;

// Platform-specific implementations of the compression function. These
// BLAKE3-specific cfg flags are set in build.rs.
#[cfg(blake3_avx2_rust)]
#[path = "rust_avx2.rs"]
mod avx2;
#[cfg(blake3_avx2_ffi)]
#[path = "ffi_avx2.rs"]
mod avx2;
#[cfg(blake3_avx512_ffi)]
#[path = "ffi_avx512.rs"]
mod avx512;
#[cfg(blake3_neon)]
#[path = "ffi_neon.rs"]
mod neon;
mod portable;
#[cfg(blake3_sse2_rust)]
#[path = "rust_sse2.rs"]
mod sse2;
#[cfg(blake3_sse2_ffi)]
#[path = "ffi_sse2.rs"]
mod sse2;
#[cfg(blake3_sse41_rust)]
#[path = "rust_sse41.rs"]
mod sse41;
#[cfg(blake3_sse41_ffi)]
#[path = "ffi_sse41.rs"]
mod sse41;

#[cfg(blake3_wasm32_simd)]
#[path = "wasm32_simd.rs"]
mod wasm32_simd;

#[cfg(feature = "traits-preview")]
pub mod traits;

mod io;
mod join;

use arrayref::{array_mut_ref, array_ref};
use arrayvec::{ArrayString, ArrayVec};
use core::cmp;
use core::fmt;
use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2};
#[cfg(feature = "zeroize")]
use zeroize::Zeroize;

/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
pub const OUT_LEN: usize = 32;

/// The number of bytes in a key, 32.
pub const KEY_LEN: usize = 32;

/// The number of bytes in a block, 64.
///
/// You don't usually need to think about this number. One case where it matters is calling
/// [`OutputReader::fill`] in a loop, where using a `buf` argument that's a multiple of `BLOCK_LEN`
/// avoids repeating work.
pub const BLOCK_LEN: usize = 64;

/// The number of bytes in a chunk, 1024.
///
/// You don't usually need to think about this number, but it often comes up in benchmarks, because
/// the maximum degree of parallelism used by the implementation equals the number of chunks.
pub const CHUNK_LEN: usize = 1024;

const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64

// While iterating the compression function within a chunk, the CV is
// represented as words, to avoid doing two extra endianness conversions for
// each compression in the portable implementation. But the hash_many interface
// needs to hash both input bytes and parent nodes, so it's better for its
// output CVs to be represented as bytes.
type CVWords = [u32; 8];
type CVBytes = [u8; 32]; // little-endian

const IV: &CVWords = &[
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];

const MSG_SCHEDULE: [[usize; 16]; 7] = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
    [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
    [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
    [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
    [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
    [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
];

// These are the internal flags that we use to domain separate root/non-root,
// chunk/parent, and chunk beginning/middle/end. Each flag is a distinct bit in
// the block flags byte passed to the compression function.
const CHUNK_START: u8 = 1 << 0;
const CHUNK_END: u8 = 1 << 1;
const PARENT: u8 = 1 << 2;
const ROOT: u8 = 1 << 3;
const KEYED_HASH: u8 = 1 << 4;
const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
const DERIVE_KEY_MATERIAL: u8 = 1 << 6;

// The low 32 bits of the 64-bit chunk/block counter.
#[inline]
fn counter_low(counter: u64) -> u32 {
    counter as u32
}

// The high 32 bits of the 64-bit chunk/block counter.
#[inline]
fn counter_high(counter: u64) -> u32 {
    (counter >> 32) as u32
}

/// An output of the default size, 32 bytes, which provides constant-time
/// equality checking.
///
/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides
/// [`from_bytes`] and [`as_bytes`] for explicit conversions between itself and
/// `[u8; 32]`. However, byte arrays and slices don't provide constant-time
/// equality checking, which is often a security requirement in software that
/// handles private data. `Hash` doesn't implement [`Deref`] or [`AsRef`], to
/// avoid situations where a type conversion happens implicitly and the
/// constant-time property is accidentally lost.
///
/// `Hash` provides the [`to_hex`] and [`from_hex`] methods for converting to
/// and from hexadecimal. It also implements [`Display`] and [`FromStr`].
///
/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html
/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html
/// [`as_bytes`]: #method.as_bytes
/// [`from_bytes`]: #method.from_bytes
/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html
/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html
/// [`to_hex`]: #method.to_hex
/// [`from_hex`]: #method.from_hex
/// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html
/// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[derive(Clone, Copy, Hash, Eq)]
pub struct Hash([u8; OUT_LEN]);

impl Hash {
    /// The raw bytes of the `Hash`. Note that byte arrays don't provide
    /// constant-time equality checking, so if you need to compare hashes,
    /// prefer the `Hash` type.
    #[inline]
    pub const fn as_bytes(&self) -> &[u8; OUT_LEN] {
        &self.0
    }

    /// Create a `Hash` from its raw bytes representation.
    pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self {
        Self(bytes)
    }

    /// Create a `Hash` from its raw bytes representation as a slice.
    ///
    /// Returns an error if the slice is not exactly 32 bytes long.
    pub fn from_slice(bytes: &[u8]) -> Result<Self, core::array::TryFromSliceError> {
        Ok(Self::from_bytes(bytes.try_into()?))
    }

    /// Encode a `Hash` in lowercase hexadecimal.
    ///
    /// The returned [`ArrayString`] is a fixed size and doesn't allocate memory
    /// on the heap. Note that [`ArrayString`] doesn't provide constant-time
    /// equality checking, so if you need to compare hashes, prefer the `Hash`
    /// type.
    ///
    /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html
    pub fn to_hex(&self) -> ArrayString<{ 2 * OUT_LEN }> {
        let mut s = ArrayString::new();
        let table = b"0123456789abcdef";
        for &b in self.0.iter() {
            s.push(table[(b >> 4) as usize] as char);
            s.push(table[(b & 0xf) as usize] as char);
        }
        s
    }

    /// Decode a `Hash` from hexadecimal. Both uppercase and lowercase ASCII
    /// bytes are supported.
    ///
    /// Any byte outside the ranges `'0'...'9'`, `'a'...'f'`, and `'A'...'F'`
    /// results in an error. An input length other than 64 also results in an
    /// error.
    ///
    /// Note that `Hash` also implements `FromStr`, so `Hash::from_hex("...")`
    /// is equivalent to `"...".parse()`.
    pub fn from_hex(hex: impl AsRef<[u8]>) -> Result<Self, HexError> {
        // Map one ASCII hex digit to its 0-15 value.
        fn hex_val(byte: u8) -> Result<u8, HexError> {
            match byte {
                b'A'..=b'F' => Ok(byte - b'A' + 10),
                b'a'..=b'f' => Ok(byte - b'a' + 10),
                b'0'..=b'9' => Ok(byte - b'0'),
                _ => Err(HexError(HexErrorInner::InvalidByte(byte))),
            }
        }
        let hex_bytes: &[u8] = hex.as_ref();
        if hex_bytes.len() != OUT_LEN * 2 {
            return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len())));
        }
        let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN];
        for i in 0..OUT_LEN {
            hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?;
        }
        Ok(Hash::from(hash_bytes))
    }
}

impl From<[u8; OUT_LEN]> for Hash {
    #[inline]
    fn from(bytes: [u8; OUT_LEN]) -> Self {
        Self::from_bytes(bytes)
    }
}

impl From<Hash> for [u8; OUT_LEN] {
    #[inline]
    fn from(hash: Hash) -> Self {
        hash.0
    }
}

impl core::str::FromStr for Hash {
    type Err = HexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Hash::from_hex(s)
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for Hash {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self(bytes) = self;
        bytes.zeroize();
    }
}

/// This implementation is constant-time.
impl PartialEq for Hash {
    #[inline]
    fn eq(&self, other: &Hash) -> bool {
        constant_time_eq::constant_time_eq_32(&self.0, &other.0)
    }
}

/// This implementation is constant-time.
impl PartialEq<[u8; OUT_LEN]> for Hash {
    #[inline]
    fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
        constant_time_eq::constant_time_eq_32(&self.0, other)
    }
}

/// This implementation is constant-time if the target is 32 bytes long.
impl PartialEq<[u8]> for Hash {
    #[inline]
    fn eq(&self, other: &[u8]) -> bool {
        constant_time_eq::constant_time_eq(&self.0, other)
    }
}

impl fmt::Display for Hash {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Formatting field as `&str` to reduce code size since the `Debug`
        // dynamic dispatch table for `&str` is likely needed elsewhere already,
        // but that for `ArrayString<[u8; 64]>` is not.
        let hex = self.to_hex();
        let hex: &str = hex.as_str();

        f.write_str(hex)
    }
}

impl fmt::Debug for Hash {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Formatting field as `&str` to reduce code size since the `Debug`
        // dynamic dispatch table for `&str` is likely needed elsewhere already,
        // but that for `ArrayString<[u8; 64]>` is not.
        let hex = self.to_hex();
        let hex: &str = hex.as_str();

        f.debug_tuple("Hash").field(&hex).finish()
    }
}

/// The error type for [`Hash::from_hex`].
///
/// The `.to_string()` representation of this error currently distinguishes between bad length
/// errors and bad character errors. This is to help with logging and debugging, but it isn't a
/// stable API detail, and it may change at any time.
#[derive(Clone, Debug)]
pub struct HexError(HexErrorInner);

#[derive(Clone, Debug)]
enum HexErrorInner {
    InvalidByte(u8),
    InvalidLen(usize),
}

impl fmt::Display for HexError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.0 {
            HexErrorInner::InvalidByte(byte) => {
                if byte < 128 {
                    write!(f, "invalid hex character: {:?}", byte as char)
                } else {
                    write!(f, "invalid hex character: 0x{:x}", byte)
                }
            }
            HexErrorInner::InvalidLen(len) => {
                write!(f, "expected 64 hex bytes, received {}", len)
            }
        }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for HexError {}

// Each chunk or parent node can produce either a 32-byte chaining value or, by
// setting the ROOT flag, any number of final output bytes. The Output struct
// captures the state just prior to choosing between those two possibilities.
#[derive(Clone)]
struct Output {
    input_chaining_value: CVWords,
    block: [u8; 64],
    block_len: u8,
    counter: u64,
    flags: u8,
    platform: Platform,
}

impl Output {
    /// Compress as a non-root node (ROOT flag not set) and return the 32-byte
    /// chaining value, as little-endian bytes of the compressed words.
    fn chaining_value(&self) -> CVBytes {
        let mut cv = self.input_chaining_value;
        self.platform.compress_in_place(
            &mut cv,
            &self.block,
            self.block_len,
            self.counter,
            self.flags,
        );
        platform::le_bytes_from_words_32(&cv)
    }

    /// Compress with the ROOT flag set and return the 32-byte root hash.
    /// Only valid with counter 0 (the first output block).
    fn root_hash(&self) -> Hash {
        debug_assert_eq!(self.counter, 0);
        let mut cv = self.input_chaining_value;
        self.platform
            .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
        Hash(platform::le_bytes_from_words_32(&cv))
    }

    /// Compress with the ROOT flag set and return a full 64-byte output block,
    /// for extendable output.
    fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
        self.platform.compress_xof(
            &self.input_chaining_value,
            &self.block,
            self.block_len,
            self.counter,
            self.flags | ROOT,
        )
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for Output {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            input_chaining_value,
            block,
            block_len,
            counter,
            flags,
            platform: _,
        } = self;

        input_chaining_value.zeroize();
        block.zeroize();
        block_len.zeroize();
        counter.zeroize();
        flags.zeroize();
    }
}

// Incremental hashing state for a single chunk of input.
#[derive(Clone)]
struct ChunkState {
    cv: CVWords,
    chunk_counter: u64,
    buf: [u8; BLOCK_LEN],
    buf_len: u8,
    blocks_compressed: u8,
    flags: u8,
    platform: Platform,
}

impl ChunkState {
    fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
        Self {
            cv: *key,
            chunk_counter,
            buf: [0; BLOCK_LEN],
            buf_len: 0,
            blocks_compressed: 0,
            flags,
            platform,
        }
    }

    /// The number of input bytes absorbed into this chunk so far, both
    /// compressed and still buffered.
    fn count(&self) -> usize {
        BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
    }

    /// Copy bytes from the front of `input` into `self.buf` (up to a full
    /// block), advancing `input` past the bytes taken.
    fn fill_buf(&mut self, input: &mut &[u8]) {
        let want = BLOCK_LEN - self.buf_len as usize;
        let take = cmp::min(want, input.len());
        self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
        self.buf_len += take as u8;
        *input = &input[take..];
    }

    /// CHUNK_START is set only on the first block of each chunk.
    fn start_flag(&self) -> u8 {
        if self.blocks_compressed == 0 {
            CHUNK_START
        } else {
            0
        }
    }

    // Try to avoid buffering as much as possible, by compressing directly from
    // the input slice when full blocks are available.
    fn update(&mut self, mut input: &[u8]) -> &mut Self {
        if self.buf_len > 0 {
            self.fill_buf(&mut input);
            if !input.is_empty() {
                debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
                let block_flags = self.flags | self.start_flag(); // borrowck
                self.platform.compress_in_place(
                    &mut self.cv,
                    &self.buf,
                    BLOCK_LEN as u8,
                    self.chunk_counter,
                    block_flags,
                );
                self.buf_len = 0;
                self.buf = [0; BLOCK_LEN];
                self.blocks_compressed += 1;
            }
        }

        // Note the strict `>`: the final full block always stays in the
        // buffer, because it might be this chunk's last block, which gets the
        // CHUNK_END flag applied later in output().
        while input.len() > BLOCK_LEN {
            debug_assert_eq!(self.buf_len, 0);
            let block_flags = self.flags | self.start_flag(); // borrowck
            self.platform.compress_in_place(
                &mut self.cv,
                array_ref!(input, 0, BLOCK_LEN),
                BLOCK_LEN as u8,
                self.chunk_counter,
                block_flags,
            );
            self.blocks_compressed += 1;
            input = &input[BLOCK_LEN..];
        }

        self.fill_buf(&mut input);
        debug_assert!(input.is_empty());
        debug_assert!(self.count() <= CHUNK_LEN);
        self
    }

    /// Capture the state needed to finalize this chunk. CHUNK_END is applied
    /// here; whether ROOT is also needed is decided later by the caller.
    fn output(&self) -> Output {
        let block_flags = self.flags | self.start_flag() | CHUNK_END;
        Output {
            input_chaining_value: self.cv,
            block: self.buf,
            block_len: self.buf_len,
            counter: self.chunk_counter,
            flags: block_flags,
            platform: self.platform,
        }
    }
}

// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for ChunkState {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("ChunkState")
            .field("count", &self.count())
            .field("chunk_counter", &self.chunk_counter)
            .field("flags", &self.flags)
            .field("platform", &self.platform)
            .finish()
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for ChunkState {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            cv,
            chunk_counter,
            buf,
            buf_len,
            blocks_compressed,
            flags,
            platform: _,
        } = self;

        cv.zeroize();
        chunk_counter.zeroize();
        buf.zeroize();
        buf_len.zeroize();
        blocks_compressed.zeroize();
        flags.zeroize();
    }
}

// IMPLEMENTATION NOTE
// ===================
// The recursive function compress_subtree_wide(), implemented below, is the
// basis of high-performance BLAKE3. We use it both for all-at-once hashing,
// and for the incremental input with Hasher (though we have to be careful with
// subtree boundaries in the incremental case). compress_subtree_wide() applies
// several optimizations at the same time:
// - Multithreading with Rayon.
// - Parallel chunk hashing with SIMD.
// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing
//   maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues
//   to benefit from larger inputs, because more levels of the tree can use
//   full-width SIMD vectors for parent hashing. Without parallel parent
//   hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.

/// Undocumented and unstable, for benchmarks only.
#[doc(hidden)]
#[derive(Clone, Copy)]
pub enum IncrementCounter {
    Yes,
    No,
}

impl IncrementCounter {
    #[inline]
    fn yes(&self) -> bool {
        match self {
            IncrementCounter::Yes => true,
            IncrementCounter::No => false,
        }
    }
}

// The largest power of two less than or equal to `n`, used in Hasher::update(). This is similar to
// left_subtree_len(n), but note that left_subtree_len(n) is strictly less than `n`.
// E.g. 7 -> 4, 8 -> 8, 9 -> 8.
fn largest_power_of_two_leq(n: usize) -> usize {
    ((n / 2) + 1).next_power_of_two()
}

// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
// on a single thread. Write out the chunk chaining values and return the
// number of chunks hashed. These chunks are never the root and never empty;
// those cases use a different codepath.
fn compress_chunks_parallel(
    input: &[u8],
    key: &CVWords,
    chunk_counter: u64,
    flags: u8,
    platform: Platform,
    out: &mut [u8],
) -> usize {
    debug_assert!(!input.is_empty(), "empty chunks below the root");
    debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);

    // Collect references to all the complete chunks, so hash_many can process
    // them in one SIMD-parallel call. `out` receives one OUT_LEN-byte chaining
    // value per chunk.
    let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
    let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new();
    for chunk in &mut chunks_exact {
        chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN));
    }
    platform.hash_many(
        &chunks_array,
        key,
        chunk_counter,
        IncrementCounter::Yes,
        flags,
        CHUNK_START,
        CHUNK_END,
        out,
    );

    // Hash the remaining partial chunk, if there is one. Note that the empty
    // chunk (meaning the empty message) is a different codepath.
    let chunks_so_far = chunks_array.len();
    if !chunks_exact.remainder().is_empty() {
        let counter = chunk_counter + chunks_so_far as u64;
        let mut chunk_state = ChunkState::new(key, counter, flags, platform);
        chunk_state.update(chunks_exact.remainder());
        *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
            chunk_state.output().chaining_value();
        chunks_so_far + 1
    } else {
        chunks_so_far
    }
}

// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left over,
// return it as an additional output.) These parents are never the root and
// never empty; those cases use a different codepath.
fn compress_parents_parallel(
    child_chaining_values: &[u8],
    key: &CVWords,
    flags: u8,
    platform: Platform,
    out: &mut [u8],
) -> usize {
    debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes");
    let num_children = child_chaining_values.len() / OUT_LEN;
    debug_assert!(num_children >= 2, "not enough children");
    debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many");

    // Each parent block is the concatenation of two child CVs, i.e. one
    // BLOCK_LEN-sized message block.
    let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN);
    // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of
    // the requirements of compress_subtree_wide().
    let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new();
    for parent in &mut parents_exact {
        parents_array.push(array_ref!(parent, 0, BLOCK_LEN));
    }
    platform.hash_many(
        &parents_array,
        key,
        0, // Parents always use counter 0.
        IncrementCounter::No,
        flags | PARENT,
        0, // Parents have no start flags.
        0, // Parents have no end flags.
        out,
    );

    // If there's an odd child left over, it becomes an output.
    let parents_so_far = parents_array.len();
    if !parents_exact.remainder().is_empty() {
        out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder());
        parents_so_far + 1
    } else {
        parents_so_far
    }
}

// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multithreading parallelism for that update().
fn compress_subtree_wide<J: join::Join>(
    input: &[u8],
    key: &CVWords,
    chunk_counter: u64,
    flags: u8,
    platform: Platform,
    out: &mut [u8],
) -> usize {
    // Note that the single chunk case does *not* bump the SIMD degree up to 2
    // when it is 1. This allows Rayon the option of multithreading even the
    // 2-chunk case, which can help performance on smaller platforms.
    if input.len() <= platform.simd_degree() * CHUNK_LEN {
        return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
    }

    // With more than simd_degree chunks, we need to recurse. Start by dividing
    // the input into left and right subtrees. (Note that this is only optimal
    // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
    // of 3 or something, we'll need a more complicated strategy.)
    debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
    let (left, right) = input.split_at(hazmat::left_subtree_len(input.len() as u64) as usize);
    let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;

    // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
    // account for the special case of returning 2 outputs when the SIMD degree
    // is 1.
    let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
    let degree = if left.len() == CHUNK_LEN {
        // The "simd_degree=1 and we're at the leaf nodes" case.
        debug_assert_eq!(platform.simd_degree(), 1);
        1
    } else {
        cmp::max(platform.simd_degree(), 2)
    };
    let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN);

    // Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple
    // threads.
    let (left_n, right_n) = J::join(
        || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out),
        || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out),
    );

    // The special case again. If simd_degree=1, then we'll have left_n=1 and
    // right_n=1. Rather than compressing them into a single output, return
    // them directly, to make sure we always have at least two outputs.
    debug_assert_eq!(left_n, degree);
    debug_assert!(right_n >= 1 && right_n <= left_n);
    if left_n == 1 {
        out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]);
        return 2;
    }

    // Otherwise, do one layer of parent node compression.
    let num_children = left_n + right_n;
    compress_parents_parallel(
        &cv_array[..num_children * OUT_LEN],
        key,
        flags,
        platform,
        out,
    )
}

// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
fn compress_subtree_to_parent_node<J: join::Join>(
    input: &[u8],
    key: &CVWords,
    chunk_counter: u64,
    flags: u8,
    platform: Platform,
) -> [u8; BLOCK_LEN] {
    debug_assert!(input.len() > CHUNK_LEN);
    let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
    let mut num_cvs =
        compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array);
    debug_assert!(num_cvs >= 2);

    // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
    // compress_subtree_wide() returns more than 2 chaining values. Condense
    // them into 2 by forming parent nodes repeatedly.
    let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2];
    while num_cvs > 2 {
        let cv_slice = &cv_array[..num_cvs * OUT_LEN];
        num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array);
        cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]);
    }
    // The final two CVs are the root parent node's message bytes.
    *array_ref!(cv_array, 0, 2 * OUT_LEN)
}

// Hash a complete input all at once. Unlike compress_subtree_wide() and
// compress_subtree_to_parent_node(), this function handles the 1 chunk case.
fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Output {
    let platform = Platform::detect();

    // If the whole subtree is one chunk, hash it directly with a ChunkState.
    if input.len() <= CHUNK_LEN {
        return ChunkState::new(key, 0, flags, platform)
            .update(input)
            .output();
    }

    // Otherwise construct an Output object from the parent node returned by
    // compress_subtree_to_parent_node().
    Output {
        input_chaining_value: *key,
        block: compress_subtree_to_parent_node::<J>(input, key, 0, flags, platform),
        block_len: BLOCK_LEN as u8,
        counter: 0,
        flags: flags | PARENT,
        platform,
    }
}

/// The default hash function.
///
/// For an incremental version that accepts multiple writes, see [`Hasher::new`],
/// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent:
///
/// ```
/// let hash = blake3::hash(b"foo");
/// # let hash1 = hash;
///
/// let hash = blake3::Hasher::new().update(b"foo").finalize();
/// # let hash2 = hash;
/// # assert_eq!(hash1, hash2);
/// ```
///
/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and
/// [`OutputReader`].
///
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
pub fn hash(input: &[u8]) -> Hash {
    hash_all_at_once::<join::SerialJoin>(input, IV, 0).root_hash()
}

/// The keyed hash function.
///
/// This is suitable for use as a message authentication code, for example to
/// replace an HMAC instance. In that use case, the constant-time equality
/// checking provided by [`Hash`](struct.Hash.html) is almost always a security
/// requirement, and callers need to be careful not to compare MACs as raw
/// bytes.
///
/// For an incremental version that accepts multiple writes, see [`Hasher::new_keyed`],
/// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent:
///
/// ```
/// # const KEY: &[u8; 32] = &[0; 32];
/// let mac = blake3::keyed_hash(KEY, b"foo");
/// # let mac1 = mac;
///
/// let mac = blake3::Hasher::new_keyed(KEY).update(b"foo").finalize();
/// # let mac2 = mac;
/// # assert_eq!(mac1, mac2);
/// ```
///
/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`].
///
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
    let key_words = platform::words_from_le_bytes_32(key);
    hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
}

/// The key derivation function.
///
/// Given cryptographic key material of any length and a context string of any
/// length, this function outputs a 32-byte derived subkey. **The context string
/// should be hardcoded, globally unique, and application-specific.** A good
/// default format for such strings is `"[application] [commit timestamp]
/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`.
///
/// Key derivation is important when you want to use the same key in multiple
/// algorithms or use cases. Using the same key with different cryptographic
/// algorithms is generally forbidden, and deriving a separate subkey for each
/// use case protects you from bad interactions. Derived keys also mitigate the
/// damage from one part of your application accidentally leaking its key.
///
/// As a rare exception to that general rule, however, it is possible to use
/// `derive_key` itself with key material that you are already using with
/// another algorithm. You might need to do this if you're adding features to
/// an existing application, which does not yet use key derivation internally.
/// However, you still must not share key material with algorithms that forbid
/// key reuse entirely, like a one-time pad. For more on this, see sections 6.2
/// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
///
/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be
/// used with passwords.** Instead, use a dedicated password hash like
/// [Argon2]. Password hashes are entirely different from generic hash
/// functions, with opposite design requirements.
///
/// For an incremental version that accepts multiple writes, see [`Hasher::new_derive_key`],
/// [`Hasher::update`], and [`Hasher::finalize`]. These two statements are equivalent:
///
/// ```
/// # const CONTEXT: &str = "example.com 2019-12-25 16:18:03 session tokens v1";
/// let key = blake3::derive_key(CONTEXT, b"key material, not a password");
/// # let key1 = key;
///
/// let key: [u8; 32] = blake3::Hasher::new_derive_key(CONTEXT)
///     .update(b"key material, not a password")
///     .finalize()
///     .into();
/// # let key2 = key;
/// # assert_eq!(key1, key2);
/// ```
///
/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`].
///
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
///
/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
    // The context string is hashed first to produce a context key, which then
    // keys the hash of the key material.
    let context_key = hazmat::hash_derive_key_context(context);
    let context_key_words = platform::words_from_le_bytes_32(&context_key);
    hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
        .root_hash()
        .0
}

// Build the Output for a parent node: its 64-byte block is the concatenation
// of the two 32-byte child chaining values, and the PARENT flag is set.
fn parent_node_output(
    left_child: &CVBytes,
    right_child: &CVBytes,
    key: &CVWords,
    flags: u8,
    platform: Platform,
) -> Output {
    let mut block = [0; BLOCK_LEN];
    block[..32].copy_from_slice(left_child);
    block[32..].copy_from_slice(right_child);
    Output {
        input_chaining_value: *key,
        block,
        block_len: BLOCK_LEN as u8,
        counter: 0,
        flags: flags | PARENT,
        platform,
    }
}

/// An incremental hash state that can accept any number of writes.
///
/// The `rayon` and `mmap` Cargo features enable additional methods on this
/// type related to multithreading and memory-mapped IO.
///
/// When the `traits-preview` Cargo feature is enabled, this type implements
/// several commonly used traits from the
/// [`digest`](https://crates.io/crates/digest) crate.
/// However, those
/// traits aren't stable, and they're expected to change in incompatible ways
/// before that crate reaches 1.0. For that reason, this crate makes no SemVer
/// guarantees for this feature, and callers who use it should expect breaking
/// changes between patch versions.
///
/// # Examples
///
/// ```
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Hash an input incrementally.
/// let mut hasher = blake3::Hasher::new();
/// hasher.update(b"foo");
/// hasher.update(b"bar");
/// hasher.update(b"baz");
/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz"));
///
/// // Extended output. OutputReader also implements Read and Seek.
/// # #[cfg(feature = "std")] {
/// let mut output = [0; 1000];
/// let mut output_reader = hasher.finalize_xof();
/// output_reader.fill(&mut output);
/// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes());
/// # }
/// # Ok(())
/// # }
/// ```
#[derive(Clone)]
pub struct Hasher {
    key: CVWords,
    chunk_state: ChunkState,
    initial_chunk_counter: u64,
    // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
    // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
    // requires a 4th entry, rather than merging everything down to 1, because
    // we don't know whether more input is coming. This is different from how
    // the reference implementation does things.
    cv_stack: ArrayVec<CVBytes, { MAX_DEPTH + 1 }>,
}

impl Hasher {
    // Shared constructor: all the public constructors differ only in the key
    // words and the flags they pass here.
    fn new_internal(key: &CVWords, flags: u8) -> Self {
        Self {
            key: *key,
            chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
            initial_chunk_counter: 0,
            cv_stack: ArrayVec::new(),
        }
    }

    /// Construct a new `Hasher` for the regular hash function.
    pub fn new() -> Self {
        Self::new_internal(IV, 0)
    }

    /// Construct a new `Hasher` for the keyed hash function. See
    /// [`keyed_hash`].
    ///
    /// [`keyed_hash`]: fn.keyed_hash.html
    pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
        let key_words = platform::words_from_le_bytes_32(key);
        Self::new_internal(&key_words, KEYED_HASH)
    }

    /// Construct a new `Hasher` for the key derivation function. See
    /// [`derive_key`]. The context string should be hardcoded, globally
    /// unique, and application-specific.
    ///
    /// [`derive_key`]: fn.derive_key.html
    pub fn new_derive_key(context: &str) -> Self {
        let context_key = hazmat::hash_derive_key_context(context);
        let context_key_words = platform::words_from_le_bytes_32(&context_key);
        Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
    }

    /// Reset the `Hasher` to its initial state.
    ///
    /// This is functionally the same as overwriting the `Hasher` with a new
    /// one, using the same key or context string if any.
    pub fn reset(&mut self) -> &mut Self {
        self.chunk_state = ChunkState::new(
            &self.key,
            0,
            self.chunk_state.flags,
            self.chunk_state.platform,
        );
        self.cv_stack.clear();
        self
    }

    // As described in push_cv() below, we do "lazy merging", delaying merges
    // until right before the next CV is about to be added. This is different
    // from the reference implementation. Another difference is that we aren't
    // always merging 1 chunk at a time. Instead, each CV might represent any
    // power-of-two number of chunks, as long as the smaller-above-larger stack
    // order is maintained. Instead of the "count the trailing 0-bits"
    // algorithm described in the spec (which assumes you're adding one chunk
    // at a time), we use a "count the total number of 1-bits" variant (which
    // doesn't assume that). The principle is the same: each CV that should
    // remain in the stack is represented by a 1-bit in the total number of
    // chunks (or bytes) so far.
    fn merge_cv_stack(&mut self, chunk_counter: u64) {
        // Account for non-zero cases of Hasher::set_input_offset, where there are no prior
        // subtrees in the stack. Note that initial_chunk_counter is always 0 for callers who don't
        // use the hazmat module.
        let post_merge_stack_len =
            (chunk_counter - self.initial_chunk_counter).count_ones() as usize;
        // Pop and merge pairs of CVs until the stack length matches the 1-bit
        // count of the chunk total (see the comment above).
        while self.cv_stack.len() > post_merge_stack_len {
            let right_child = self.cv_stack.pop().unwrap();
            let left_child = self.cv_stack.pop().unwrap();
            let parent_output = parent_node_output(
                &left_child,
                &right_child,
                &self.key,
                self.chunk_state.flags,
                self.chunk_state.platform,
            );
            self.cv_stack.push(parent_output.chaining_value());
        }
    }

    // In reference_impl.rs, we merge the new CV with existing CVs from the
    // stack before pushing it. We can do that because we know more input is
    // coming, so we know none of the merges are root.
    //
    // This setting is different. We want to feed as much input as possible to
    // compress_subtree_wide(), without setting aside anything for the
    // chunk_state. If the user gives us 64 KiB, we want to parallelize over
    // all 64 KiB at once as a single subtree, if at all possible.
    //
    // This leads to two problems:
    // 1) This 64 KiB input might be the only call that ever gets made to
    //    update. In this case, the root node of the 64 KiB subtree would be
    //    the root node of the whole tree, and it would need to be ROOT
    //    finalized. We can't compress it until we know.
    // 2) This 64 KiB input might complete a larger tree, whose root node is
    //    similarly going to be the root of the whole tree. For example,
    //    maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't
    //    compress the node at the root of the 256 KiB subtree until we know
    //    how to finalize it.
    //
    // The second problem is solved with "lazy merging". That is, when we're
    // about to add a CV to the stack, we don't merge it with anything first,
    // as the reference impl does. Instead we do merges using the *previous* CV
    // that was added, which is sitting on top of the stack, and we put the new
    // CV (unmerged) on top of the stack afterwards. This guarantees that we
    // never merge the root node until finalize().
    //
    // Solving the first problem requires an additional tool,
    // compress_subtree_to_parent_node(). That function always returns the top
    // *two* chaining values of the subtree it's compressing. We then do lazy
    // merging with each of them separately, so that the second CV will always
    // remain unmerged. (That also helps us support extendable output when
    // we're hashing an input all-at-once.)
    fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
        self.merge_cv_stack(chunk_counter);
        self.cv_stack.push(*new_cv);
    }

    /// Add input bytes to the hash state. You can call this any number of times.
    ///
    /// This method is always single-threaded. For multithreading support, see
    /// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature).
    ///
    /// Note that the degree of SIMD parallelism that `update` can use is limited by the size of
    /// this input buffer. See [`update_reader`](#method.update_reader).
    pub fn update(&mut self, input: &[u8]) -> &mut Self {
        self.update_with_join::<join::SerialJoin>(input)
    }

    // The shared implementation behind update() and update_rayon(): `J`
    // selects serial or multithreaded joining of the two recursive halves.
    fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
        // Enforce the hazmat subtree-length limit when a non-zero input offset
        // has been set (initial_chunk_counter is 0 otherwise, and
        // max_subtree_len returns None in that case).
        let input_offset = self.initial_chunk_counter * CHUNK_LEN as u64;
        if let Some(max) = hazmat::max_subtree_len(input_offset) {
            let remaining = max - self.count();
            assert!(
                input.len() as u64 <= remaining,
                "the subtree starting at {} contains at most {} bytes (found {})",
                CHUNK_LEN as u64 * self.initial_chunk_counter,
                max,
                input.len(),
            );
        }
        // If we have some partial chunk bytes in the internal chunk_state, we
        // need to finish that chunk first.
        if self.chunk_state.count() > 0 {
            let want = CHUNK_LEN - self.chunk_state.count();
            let take = cmp::min(want, input.len());
            self.chunk_state.update(&input[..take]);
            input = &input[take..];
            if !input.is_empty() {
                // We've filled the current chunk, and there's more input
                // coming, so we know it's not the root and we can finalize it.
                // Then we'll proceed to hashing whole chunks below.
                debug_assert_eq!(self.chunk_state.count(), CHUNK_LEN);
                let chunk_cv = self.chunk_state.output().chaining_value();
                self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
                self.chunk_state = ChunkState::new(
                    &self.key,
                    self.chunk_state.chunk_counter + 1,
                    self.chunk_state.flags,
                    self.chunk_state.platform,
                );
            } else {
                return self;
            }
        }

        // Now the chunk_state is clear, and we have more input. If there's
        // more than a single chunk (so, definitely not the root chunk), hash
        // the largest whole subtree we can, with the full benefits of SIMD and
        // multithreading parallelism. Two restrictions:
        // - The subtree has to be a power-of-2 number of chunks. Only subtrees
        //   along the right edge can be incomplete, and we don't know where
        //   the right edge is going to be until we get to finalize().
        // - The subtree must evenly divide the total number of chunks up until
        //   this point (if total is not 0). If the current incomplete subtree
        //   is only waiting for 1 more chunk, we can't hash a subtree of 4
        //   chunks. We have to complete the current subtree first.
        // Because we might need to break up the input to form powers of 2, or
        // to evenly divide what we already have, this part runs in a loop.
        while input.len() > CHUNK_LEN {
            debug_assert_eq!(self.chunk_state.count(), 0, "no partial chunk data");
            debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
            let mut subtree_len = largest_power_of_two_leq(input.len());
            let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
            // Shrink the subtree_len until it evenly divides the count so far.
            // We know that subtree_len itself is a power of 2, so we can use a
            // bitmasking trick instead of an actual remainder operation. (Note
            // that if the caller consistently passes power-of-2 inputs of the
            // same size, as is hopefully typical, this loop condition will
            // always fail, and subtree_len will always be the full length of
            // the input.)
            //
            // An aside: We don't have to shrink subtree_len quite this much.
            // For example, if count_so_far is 1, we could pass 2 chunks to
            // compress_subtree_to_parent_node. Since we'll get 2 CVs back,
            // we'll still get the right answer in the end, and we might get to
            // use 2-way SIMD parallelism. The problem with this optimization,
            // is that it gets us stuck always hashing 2 chunks. The total
            // number of chunks will remain odd, and we'll never graduate to
            // higher degrees of parallelism. See
            // https://github.com/BLAKE3-team/BLAKE3/issues/69.
            while (subtree_len - 1) as u64 & count_so_far != 0 {
                subtree_len /= 2;
            }
            // The shrunken subtree_len might now be 1 chunk long. If so, hash
            // that one chunk by itself. Otherwise, compress the subtree into a
            // pair of CVs.
            let subtree_chunks = (subtree_len / CHUNK_LEN) as u64;
            if subtree_len <= CHUNK_LEN {
                debug_assert_eq!(subtree_len, CHUNK_LEN);
                self.push_cv(
                    &ChunkState::new(
                        &self.key,
                        self.chunk_state.chunk_counter,
                        self.chunk_state.flags,
                        self.chunk_state.platform,
                    )
                    .update(&input[..subtree_len])
                    .output()
                    .chaining_value(),
                    self.chunk_state.chunk_counter,
                );
            } else {
                // This is the high-performance happy path, though getting here
                // depends on the caller giving us a long enough input.
                let cv_pair = compress_subtree_to_parent_node::<J>(
                    &input[..subtree_len],
                    &self.key,
                    self.chunk_state.chunk_counter,
                    self.chunk_state.flags,
                    self.chunk_state.platform,
                );
                let left_cv = array_ref!(cv_pair, 0, 32);
                let right_cv = array_ref!(cv_pair, 32, 32);
                // Push the two CVs we received into the CV stack in order. Because
                // the stack merges lazily, this guarantees we aren't merging the
                // root.
                self.push_cv(left_cv, self.chunk_state.chunk_counter);
                self.push_cv(
                    right_cv,
                    self.chunk_state.chunk_counter + (subtree_chunks / 2),
                );
            }
            self.chunk_state.chunk_counter += subtree_chunks;
            input = &input[subtree_len..];
        }

        // What remains is 1 chunk or less. Add it to the chunk state.
        debug_assert!(input.len() <= CHUNK_LEN);
        if !input.is_empty() {
            self.chunk_state.update(input);
            // Having added some input to the chunk_state, we know what's in
            // the CV stack won't become the root node, and we can do an extra
            // merge. This simplifies finalize().
            self.merge_cv_stack(self.chunk_state.chunk_counter);
        }

        self
    }

    fn final_output(&self) -> Output {
        // If the current chunk is the only chunk, that makes it the root node
        // also. Convert it directly into an Output. Otherwise, we need to
        // merge subtrees below.
        if self.cv_stack.is_empty() {
            debug_assert_eq!(self.chunk_state.chunk_counter, self.initial_chunk_counter);
            return self.chunk_state.output();
        }

        // If there are any bytes in the ChunkState, finalize that chunk and
        // merge its CV with everything in the CV stack. In that case, the work
        // we did at the end of update() above guarantees that the stack
        // doesn't contain any unmerged subtrees that need to be merged first.
        // (This is important, because if there were two chunk hashes sitting
        // on top of the stack, they would need to merge with each other, and
        // merging a new chunk hash into them would be incorrect.)
        //
        // If there are no bytes in the ChunkState, we'll merge what's already
        // in the stack. In this case it's fine if there are unmerged chunks on
        // top, because we'll merge them with each other. Note that the case of
        // the empty chunk is taken care of above.
        let mut output: Output;
        let mut num_cvs_remaining = self.cv_stack.len();
        if self.chunk_state.count() > 0 {
            debug_assert_eq!(
                self.cv_stack.len(),
                (self.chunk_state.chunk_counter - self.initial_chunk_counter).count_ones() as usize,
                "cv stack does not need a merge",
            );
            output = self.chunk_state.output();
        } else {
            debug_assert!(self.cv_stack.len() >= 2);
            output = parent_node_output(
                &self.cv_stack[num_cvs_remaining - 2],
                &self.cv_stack[num_cvs_remaining - 1],
                &self.key,
                self.chunk_state.flags,
                self.chunk_state.platform,
            );
            num_cvs_remaining -= 2;
        }
        // Fold the rest of the stack (top to bottom) into a chain of parent
        // nodes; the last Output produced is the (unfinalized) root.
        while num_cvs_remaining > 0 {
            output = parent_node_output(
                &self.cv_stack[num_cvs_remaining - 1],
                &output.chaining_value(),
                &self.key,
                self.chunk_state.flags,
                self.chunk_state.platform,
            );
            num_cvs_remaining -= 1;
        }
        output
    }

    /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of
    /// the input.
    ///
    /// This method is idempotent. Calling it twice will give the same result.
    /// You can also add more input and finalize again.
+ pub fn finalize(&self) -> Hash { + assert_eq!( + self.initial_chunk_counter, 0, + "set_input_offset must be used with finalize_non_root", + ); + self.final_output().root_hash() + } + + /// Finalize the hash state and return an [`OutputReader`], which can + /// supply any number of output bytes. + /// + /// This method is idempotent. Calling it twice will give the same result. + /// You can also add more input and finalize again. + /// + /// [`OutputReader`]: struct.OutputReader.html + pub fn finalize_xof(&self) -> OutputReader { + assert_eq!( + self.initial_chunk_counter, 0, + "set_input_offset must be used with finalize_non_root", + ); + OutputReader::new(self.final_output()) + } + + /// Return the total number of bytes hashed so far. + /// + /// [`hazmat::HasherExt::set_input_offset`] does not affect this value. This only counts bytes + /// passed to [`update`](Hasher::update). + pub fn count(&self) -> u64 { + // Account for non-zero cases of Hasher::set_input_offset. Note that initial_chunk_counter + // is always 0 for callers who don't use the hazmat module. + (self.chunk_state.chunk_counter - self.initial_chunk_counter) * CHUNK_LEN as u64 + + self.chunk_state.count() as u64 + } + + /// As [`update`](Hasher::update), but reading from a + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation. + /// + /// [`Hasher`] implements + /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to + /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`] + /// from any reader. Unfortunately, this standard approach can limit performance, because + /// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of + /// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512) + /// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly + /// more convenient. 
+ /// + /// The internal buffer size this method uses may change at any time, and it may be different + /// for different targets. The only guarantee is that it will be large enough for all of this + /// crate's SIMD implementations on the current platform. + /// + /// The most common implementer of + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be + /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory + /// mapping can be faster than this method for hashing large files. See + /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon), + /// which require the `mmap` and (for the latter) `rayon` Cargo features. + /// + /// This method requires the `std` Cargo feature, which is enabled by default. + /// + /// # Example + /// + /// ```no_run + /// # use std::fs::File; + /// # use std::io; + /// # fn main() -> io::Result<()> { + /// // Hash standard input. + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_reader(std::io::stdin().lock())?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "std")] + pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> { + io::copy_wide(reader, self)?; + Ok(self) + } + + /// As [`update`](Hasher::update), but using Rayon-based multithreading + /// internally. + /// + /// This method is gated by the `rayon` Cargo feature, which is disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// To get any performance benefit from multithreading, the input buffer + /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is + /// _slower_ than `update` for inputs under 128 KiB. That threshold varies + /// quite a lot across different processors, and it's important to benchmark + /// your specific use case. See also the performance warning associated with + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon). 
+ /// + /// If you already have a large buffer in memory, and you want to hash it + /// with multiple threads, this method is a good option. However, reading a + /// file into memory just to call this method can be a performance mistake, + /// both because it requires lots of memory and because single-threaded + /// reads can be slow. For hashing whole files, see + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both + /// the `rayon` and `mmap` Cargo features. + #[cfg(feature = "rayon")] + pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { + self.update_with_join::<join::RayonJoin>(input) + } + + /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping. + /// + /// Not all files can be memory mapped, and memory mapping small files can be slower than + /// reading them the usual way. In those cases, this method will fall back to standard file IO. + /// The heuristic for whether to use memory mapping is currently very simple (file size >= + /// 16 KiB), and it might change at any time. + /// + /// Like [`update`](Hasher::update), this method is single-threaded. In this author's + /// experience, memory mapping improves single-threaded performance by ~10% for large files + /// that are already in cache. This probably varies between platforms, and as always it's a + /// good idea to benchmark your own use case. In comparison, the multithreaded + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on + /// performance. + /// + /// There's a correctness reason that this method takes + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of + /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped + /// file ignores the seek position of the original file handle (it neither respects the current + /// position nor updates the position). 
This difference in behavior would've caused + /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and + /// have different side effects in some cases. Taking a + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by + /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is + /// opened internally. + /// + /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on + /// [docs.rs](https://docs.rs). + /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// let path = Path::new("file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap(path)?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + pub fn update_mmap(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } + + /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using + /// memory mapping. This is the default behavior of `b3sum`. + /// + /// For large files that are likely to be in cache, this can be much faster than + /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other + /// cryptographic hashes, this is usually what they're measuring. However... + /// + /// **Performance Warning:** There are cases where multithreading hurts performance. The worst + /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31), + /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends + /// more time seeking around than reading data). 
Windows tends to be somewhat worse about this, + /// in part because it's less likely than Linux to keep very large files in cache. More + /// generally, if your CPU cores are already busy, then multithreading will add overhead + /// without improving performance. If your code runs in different environments that you don't + /// control and can't measure, then unfortunately there's no one-size-fits-all answer for + /// whether multithreading is a good idea. + /// + /// The memory mapping behavior of this function is the same as + /// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard + /// file IO might change at any time. + /// + /// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// # #[cfg(feature = "rayon")] + /// # { + /// let path = Path::new("big_file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap_rayon(path)?; + /// println!("{}", hasher.finalize()); + /// # } + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + #[cfg(feature = "rayon")] + pub fn update_mmap_rayon( + &mut self, + path: impl AsRef<std::path::Path>, + ) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update_rayon(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } +} + +// Don't derive(Debug), because the state may be secret. 
+impl fmt::Debug for Hasher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Hasher") + .field("flags", &self.chunk_state.flags) + .field("platform", &self.chunk_state.platform) + .finish() + } +} + +impl Default for Hasher { + #[inline] + fn default() -> Self { + Self::new() + } +} + +#[cfg(feature = "std")] +impl std::io::Write for Hasher { + /// This is equivalent to [`update`](#method.update). + #[inline] + fn write(&mut self, input: &[u8]) -> std::io::Result<usize> { + self.update(input); + Ok(input.len()) + } + + #[inline] + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +#[cfg(feature = "zeroize")] +impl Zeroize for Hasher { + fn zeroize(&mut self) { + // Destructuring to trigger compile error as a reminder to update this impl. + let Self { + key, + chunk_state, + initial_chunk_counter, + cv_stack, + } = self; + + key.zeroize(); + chunk_state.zeroize(); + initial_chunk_counter.zeroize(); + cv_stack.zeroize(); + } +} + +/// An incremental reader for extended output, returned by +/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). +/// +/// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is +/// equivalent to truncating the default-length output. Note that this is a difference between +/// BLAKE2 and BLAKE3. +/// +/// # Security notes +/// +/// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit +/// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 +/// bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional +/// security. +/// +/// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or +/// the arguments to [`seek`](struct.OutputReader.html#method.seek) or +/// [`set_position`](struct.OutputReader.html#method.set_position). 
[_Block-Cipher-Based Tree +/// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows +/// both the message and the key (if any) can easily determine the offset of an extended output. +/// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block +/// from an unknown position in the output stream to recover its block index. Callers with strong +/// secret keys aren't affected in practice, but secret offsets are a [design +/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case. +#[derive(Clone)] +pub struct OutputReader { + inner: Output, + position_within_block: u8, +} + +impl OutputReader { + fn new(inner: Output) -> Self { + Self { + inner, + position_within_block: 0, + } + } + + // This helper function handles both the case where the output buffer is + // shorter than one block, and the case where our position_within_block is + // non-zero. + fn fill_one_block(&mut self, buf: &mut &mut [u8]) { + let output_block: [u8; BLOCK_LEN] = self.inner.root_output_block(); + let output_bytes = &output_block[self.position_within_block as usize..]; + let take = cmp::min(buf.len(), output_bytes.len()); + buf[..take].copy_from_slice(&output_bytes[..take]); + self.position_within_block += take as u8; + if self.position_within_block == BLOCK_LEN as u8 { + self.inner.counter += 1; + self.position_within_block = 0; + } + // Advance the dest buffer. mem::take() is a borrowck workaround. + *buf = &mut core::mem::take(buf)[take..]; + } + + /// Fill a buffer with output bytes and advance the position of the + /// `OutputReader`. This is equivalent to [`Read::read`], except that it + /// doesn't return a `Result`. Both methods always fill the entire buffer. + /// + /// Note that `OutputReader` doesn't buffer output bytes internally, so + /// calling `fill` repeatedly with a short-length or odd-length slice will + /// end up performing the same compression multiple times. 
If you're + /// reading output in a loop, prefer a slice length that's a multiple of + /// [`BLOCK_LEN`] (64 bytes). + /// + /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try + /// to extract more than that, for example by seeking near the end and + /// reading further, the behavior is unspecified. + /// + /// [`Read::read`]: #method.read + pub fn fill(&mut self, mut buf: &mut [u8]) { + if buf.is_empty() { + return; + } + + // If we're partway through a block, try to get to a block boundary. + if self.position_within_block != 0 { + self.fill_one_block(&mut buf); + } + + let full_blocks = buf.len() / BLOCK_LEN; + let full_blocks_len = full_blocks * BLOCK_LEN; + if full_blocks > 0 { + debug_assert_eq!(0, self.position_within_block); + self.inner.platform.xof_many( + &self.inner.input_chaining_value, + &self.inner.block, + self.inner.block_len, + self.inner.counter, + self.inner.flags | ROOT, + &mut buf[..full_blocks_len], + ); + self.inner.counter += full_blocks as u64; + buf = &mut buf[full_blocks * BLOCK_LEN..]; + } + + if !buf.is_empty() { + debug_assert!(buf.len() < BLOCK_LEN); + self.fill_one_block(&mut buf); + debug_assert!(buf.is_empty()); + } + } + + /// Return the current read position in the output stream. This is + /// equivalent to [`Seek::stream_position`], except that it doesn't return + /// a `Result`. The position of a new `OutputReader` starts at 0, and each + /// call to [`fill`] or [`Read::read`] moves the position forward by the + /// number of bytes read. + /// + /// [`Seek::stream_position`]: #method.stream_position + /// [`fill`]: #method.fill + /// [`Read::read`]: #method.read + pub fn position(&self) -> u64 { + self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64 + } + + /// Seek to a new read position in the output stream. This is equivalent to + /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't + /// return a `Result`. 
+ /// + /// [`Seek::seek`]: #method.seek + /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html + pub fn set_position(&mut self, position: u64) { + self.position_within_block = (position % BLOCK_LEN as u64) as u8; + self.inner.counter = position / BLOCK_LEN as u64; + } +} + +// Don't derive(Debug), because the state may be secret. +impl fmt::Debug for OutputReader { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("OutputReader") + .field("position", &self.position()) + .finish() + } +} + +#[cfg(feature = "std")] +impl std::io::Read for OutputReader { + #[inline] + fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { + self.fill(buf); + Ok(buf.len()) + } +} + +#[cfg(feature = "std")] +impl std::io::Seek for OutputReader { + fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> { + let max_position = u64::max_value() as i128; + let target_position: i128 = match pos { + std::io::SeekFrom::Start(x) => x as i128, + std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128, + std::io::SeekFrom::End(_) => { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "seek from end not supported", + )); + } + }; + if target_position < 0 { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "seek before start", + )); + } + self.set_position(cmp::min(target_position, max_position) as u64); + Ok(self.position()) + } +} + +#[cfg(feature = "zeroize")] +impl Zeroize for OutputReader { + fn zeroize(&mut self) { + // Destructuring to trigger compile error as a reminder to update this impl. 
+ let Self { + inner, + position_within_block, + } = self; + + inner.zeroize(); + position_within_block.zeroize(); + } +} diff --git a/thirdparty/blake3/src/platform.rs b/thirdparty/blake3/src/platform.rs new file mode 100644 index 000000000..51b3b7b17 --- /dev/null +++ b/thirdparty/blake3/src/platform.rs @@ -0,0 +1,587 @@ +use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; +use arrayref::{array_mut_ref, array_ref}; + +cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + cfg_if::cfg_if! { + if #[cfg(blake3_avx512_ffi)] { + pub const MAX_SIMD_DEGREE: usize = 16; + } else { + pub const MAX_SIMD_DEGREE: usize = 8; + } + } + } else if #[cfg(blake3_neon)] { + pub const MAX_SIMD_DEGREE: usize = 4; + } else if #[cfg(blake3_wasm32_simd)] { + pub const MAX_SIMD_DEGREE: usize = 4; + } else { + pub const MAX_SIMD_DEGREE: usize = 1; + } +} + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently +// allowed to use cmp::max, so we have to hardcode this additional constant +// value. Get rid of this once cmp::max is a const fn. +cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + cfg_if::cfg_if! 
{ + if #[cfg(blake3_avx512_ffi)] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 16; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 8; + } + } + } else if #[cfg(blake3_neon)] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 4; + } else if #[cfg(blake3_wasm32_simd)] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 4; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 2; + } +} + +#[derive(Clone, Copy, Debug)] +pub enum Platform { + Portable, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE2, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE41, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX2, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX512, + #[cfg(blake3_neon)] + NEON, + #[cfg(blake3_wasm32_simd)] + #[allow(non_camel_case_types)] + WASM32_SIMD, +} + +impl Platform { + #[allow(unreachable_code)] + pub fn detect() -> Self { + #[cfg(miri)] + { + return Platform::Portable; + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + #[cfg(blake3_avx512_ffi)] + { + if avx512_detected() { + return Platform::AVX512; + } + } + if avx2_detected() { + return Platform::AVX2; + } + if sse41_detected() { + return Platform::SSE41; + } + if sse2_detected() { + return Platform::SSE2; + } + } + // We don't use dynamic feature detection for NEON. If the "neon" + // feature is on, NEON is assumed to be supported. 
+ #[cfg(blake3_neon)] + { + return Platform::NEON; + } + #[cfg(blake3_wasm32_simd)] + { + return Platform::WASM32_SIMD; + } + Platform::Portable + } + + pub fn simd_degree(&self) -> usize { + let degree = match self { + Platform::Portable => 1, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => 8, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => 16, + #[cfg(blake3_neon)] + Platform::NEON => 4, + #[cfg(blake3_wasm32_simd)] + Platform::WASM32_SIMD => 4, + }; + debug_assert!(degree <= MAX_SIMD_DEGREE); + degree + } + + pub fn compress_in_place( + &self, + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) { + match self { + Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_in_place(cv, block, block_len, counter, flags) + }, + // No NEON compress_in_place() implementation yet. 
+ #[cfg(blake3_neon)] + Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), + #[cfg(blake3_wasm32_simd)] + Platform::WASM32_SIMD => { + crate::wasm32_simd::compress_in_place(cv, block, block_len, counter, flags) + } + } + } + + pub fn compress_xof( + &self, + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) -> [u8; 64] { + match self { + Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_xof(cv, block, block_len, counter, flags) + }, + // No NEON compress_xof() implementation yet. + #[cfg(blake3_neon)] + Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), + #[cfg(blake3_wasm32_simd)] + Platform::WASM32_SIMD => { + crate::wasm32_simd::compress_xof(cv, block, block_len, counter, flags) + } + } + } + + // IMPLEMENTATION NOTE + // =================== + // hash_many() applies two optimizations. The critically important + // optimization is the high-performance parallel SIMD hashing mode, + // described in detail in the spec. This more than doubles throughput per + // thread. Another optimization is keeping the state vectors transposed + // from block to block within a chunk. When state vectors are transposed + // after every block, there's a small but measurable performance loss. 
+ // Compressing chunks with a dedicated loop avoids this. + + pub fn hash_many<const N: usize>( + &self, + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], + ) { + match self { + Platform::Portable => portable::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => unsafe { + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => unsafe { + crate::avx2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Assumed to be safe if the "neon" feature is on. + #[cfg(blake3_neon)] + Platform::NEON => unsafe { + crate::neon::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Assumed to be safe if the "wasm32_simd" feature is on. 
+ #[cfg(blake3_wasm32_simd)] + Platform::WASM32_SIMD => unsafe { + crate::wasm32_simd::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + } + } + + pub fn xof_many( + &self, + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + mut counter: u64, + flags: u8, + out: &mut [u8], + ) { + debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); + if out.is_empty() { + // The current assembly implementation always outputs at least 1 block. + return; + } + match self { + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(unix)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::xof_many(cv, block, block_len, counter, flags, out) + }, + _ => { + // For platforms without an optimized xof_many, fall back to a loop over + // compress_xof. This is still faster than portable code. + for out_block in out.chunks_exact_mut(BLOCK_LEN) { + // TODO: Use array_chunks_mut here once that's stable. + let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap(); + *out_array = self.compress_xof(cv, block, block_len, counter, flags); + counter += 1; + } + } + } + } + + // Explicit platform constructors, for benchmarks. 
+ + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse2() -> Option<Self> { + if sse2_detected() { + Some(Self::SSE2) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option<Self> { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option<Self> { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } + + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx512() -> Option<Self> { + if avx512_detected() { + Some(Self::AVX512) + } else { + None + } + } + + #[cfg(blake3_neon)] + pub fn neon() -> Option<Self> { + // Assumed to be safe if the "neon" feature is on. + Some(Self::NEON) + } + + #[cfg(blake3_wasm32_simd)] + pub fn wasm32_simd() -> Option<Self> { + // Assumed to be safe if the "wasm32_simd" feature is on. + Some(Self::WASM32_SIMD) + } +} + +// Note that AVX-512 is divided into multiple featuresets, and we use two of +// them, F and VL. +#[cfg(blake3_avx512_ffi)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +#[allow(unreachable_code)] +pub fn avx512_detected() -> bool { + if cfg!(miri) { + return false; + } + + // A testing-only short-circuit. + if cfg!(feature = "no_avx512") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +#[allow(unreachable_code)] +pub fn avx2_detected() -> bool { + if cfg!(miri) { + return false; + } + + // A testing-only short-circuit. 
// (tail of avx2_detected(): the signature, miri check, and the preceding
// comment are in the part of the file before this chunk — unchanged here)
    if cfg!(feature = "no_avx2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "avx2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("avx2") {
            return true;
        }
    }
    false
}

/// Reports whether SSE4.1 is usable on this x86/x86_64 target.
///
/// Returns true when SSE4.1 is statically enabled at compile time
/// (`target_feature = "sse4.1"`), or — with the "std" feature — when the CPU
/// reports it at runtime. Always false under miri, or with the testing-only
/// "no_sse41" feature.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse41") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse4.1")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse4.1") {
            return true;
        }
    }
    false
}

/// Reports whether SSE2 is usable on this x86/x86_64 target.
///
/// Same structure as the detectors above: static `target_feature` check
/// first, then a runtime check when the "std" feature is on; always false
/// under miri or with the testing-only "no_sse2" feature.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse2") {
            return true;
        }
    }
    false
}

/// Unpacks 32 little-endian bytes into 8 u32 words (fully unrolled).
#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
    let mut out = [0; 8];
    out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
    out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
    out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
    out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
    out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
    out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
    out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
    out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
    out
}

/// Unpacks a 64-byte block into 16 little-endian u32 words (fully unrolled).
#[inline(always)]
pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
    let mut out = [0; 16];
    out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
    out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
    out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
    out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
    out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
    out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
    out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
    out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
    out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
    out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
    out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
    out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
    out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
    out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
    out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
    out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
    out
}

/// Packs 8 u32 words into 32 little-endian bytes. Inverse of
/// words_from_le_bytes_32.
#[inline(always)]
pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
    let mut out = [0; 32];
    *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
    *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
    *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
    *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
    *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
    *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
    *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
    *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
    out
}

/// Packs 16 u32 words into 64 little-endian bytes. Inverse of
/// words_from_le_bytes_64.
#[inline(always)]
pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
    let mut out = [0; 64];
    *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
    *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
    *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
    *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
    *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
    *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
    *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
    *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
    *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
    *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
    *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
    *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
    *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
    *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
    *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
    *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
    out
}
diff --git a/thirdparty/blake3/src/portable.rs b/thirdparty/blake3/src/portable.rs
new file mode 100644
index 000000000..7af6828b0
--- /dev/null
+++ b/thirdparty/blake3/src/portable.rs
@@ -0,0 +1,198 @@
use crate::{
    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
    OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref};

/// The BLAKE3 g ("quarter-round") mix: folds message words `x` and `y` into
/// state lanes `a`, `b`, `c`, `d` with wrapping adds, xors, and the fixed
/// rotation amounts 16, 12, 8, 7.
#[inline(always)]
fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
    state[d] = (state[d] ^ state[a]).rotate_right(16);
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(12);
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(y);
    state[d] = (state[d] ^ state[a]).rotate_right(8);
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(7);
}

/// One full compression round: four column mixes followed by four diagonal
/// mixes, feeding in message words in the order given by MSG_SCHEDULE[round].
#[inline(always)]
fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
    // Select the message schedule based on the round.
    let schedule = MSG_SCHEDULE[round];

    // Mix the columns.
    g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
    g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
    g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
    g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);

    // Mix the diagonals.
    g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
    g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
    g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
    g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}

/// Runs all 7 rounds and returns the full 16-word state *before* the final
/// feed-forward. Shared by compress_in_place (8-word output) and compress_xof
/// (16-word extended output).
#[inline(always)]
fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u32; 16] {
    let block_words = crate::platform::words_from_le_bytes_64(block);

    // State layout: rows 0-1 are the chaining value, row 2 is the IV
    // constants, row 3 is the counter, block length, and domain flags.
    let mut state = [
        cv[0],
        cv[1],
        cv[2],
        cv[3],
        cv[4],
        cv[5],
        cv[6],
        cv[7],
        IV[0],
        IV[1],
        IV[2],
        IV[3],
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    ];

    round(&mut state, &block_words, 0);
    round(&mut state, &block_words, 1);
    round(&mut state, &block_words, 2);
    round(&mut state, &block_words, 3);
    round(&mut state, &block_words, 4);
    round(&mut state, &block_words, 5);
    round(&mut state, &block_words, 6);

    state
}

/// Compresses one block and feeds the result forward into `cv`
/// (cv[i] = state[i] ^ state[i + 8]).
pub fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let state = compress_pre(cv, block, block_len, counter, flags);

    cv[0] = state[0] ^ state[8];
    cv[1] = state[1] ^ state[9];
    cv[2] = state[2] ^ state[10];
    cv[3] = state[3] ^ state[11];
    cv[4] = state[4] ^ state[12];
    cv[5] = state[5] ^ state[13];
    cv[6] = state[6] ^ state[14];
    cv[7] = state[7] ^ state[15];
}

/// Compresses one block and returns the full 64-byte extended output: the low
/// half is the usual feed-forward, the high half is xored with the input cv.
pub fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let mut state = compress_pre(cv, block, block_len, counter, flags);
    state[0] ^= state[8];
    state[1] ^= state[9];
    state[2] ^= state[10];
    state[3] ^= state[11];
    state[4] ^= state[12];
    state[5] ^= state[13];
    state[6] ^= state[14];
    state[7] ^= state[15];
    state[8] ^= cv[0];
    state[9] ^= cv[1];
    state[10] ^= cv[2];
    state[11] ^= cv[3];
    state[12] ^= cv[4];
    state[13] ^= cv[5];
    state[14] ^= cv[6];
    state[15] ^= cv[7];
    crate::platform::le_bytes_from_words_64(&state)
}

/// Hashes one N-byte input made of whole 64-byte blocks, chaining the cv
/// through each block. `flags_start` is or'd into the first block's flags and
/// `flags_end` into the last block's; the same `counter` is used for every
/// block. Writes the final 32-byte cv to `out`.
pub fn hash1<const N: usize>(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = crate::platform::le_bytes_from_words_32(&cv);
}

/// Hashes each input independently via hash1, writing each 32-byte result to
/// the next OUT_LEN chunk of `out`. The counter is bumped per input when
/// `increment_counter` says so.
pub fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
pub mod test {
    use super::*;

    // This is basically testing the portable implementation against itself,
    // but it also checks that compress_in_place and compress_xof are
    // consistent. And there are tests against the reference implementation and
    // against hardcoded test vectors elsewhere.
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Ditto.
    #[test]
    fn test_hash_many() {
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}
diff --git a/thirdparty/blake3/src/rust_avx2.rs b/thirdparty/blake3/src/rust_avx2.rs
new file mode 100644
index 000000000..a37a4caac
--- /dev/null
+++ b/thirdparty/blake3/src/rust_avx2.rs
@@ -0,0 +1,474 @@
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
};
use arrayref::{array_mut_ref, mut_array_refs};

// This implementation hashes 8 inputs in parallel, one per 32-bit lane.
pub const DEGREE: usize = 8;

#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m256i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm256_loadu_si256(src as *const __m256i)
}

#[inline(always)]
unsafe fn storeu(src: __m256i, dest: *mut u8) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm256_storeu_si256(dest as *mut __m256i, src)
}

// Lane-wise 32-bit wrapping add.
#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
    _mm256_add_epi32(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
    _mm256_xor_si256(a, b)
}

// Broadcast one u32 to all 8 lanes.
#[inline(always)]
unsafe fn set1(x: u32) -> __m256i {
    _mm256_set1_epi32(x as i32)
}

// Build a vector from 8 u32s, `a` in the lowest lane.
#[inline(always)]
unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i {
    _mm256_setr_epi32(
        a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32,
    )
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
// Lane-wise rotate right by 16 bits (shift-or form; see the comment above).
#[inline(always)]
unsafe fn rot16(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16))
}

// Lane-wise rotate right by 12 bits.
#[inline(always)]
unsafe fn rot12(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12))
}

// Lane-wise rotate right by 8 bits.
#[inline(always)]
unsafe fn rot8(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8))
}

// Lane-wise rotate right by 7 bits.
#[inline(always)]
unsafe fn rot7(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7))
}

/// One compression round over 8 transposed states at once. `v` holds the 16
/// state words (each vector carrying that word for all 8 inputs), `m` the 16
/// transposed message vectors; the message order comes from MSG_SCHEDULE[r].
/// The add/xor/rot16/add/xor/rot12 … sequence is the scalar g function,
/// applied first to the columns and then to the diagonals.
#[inline(always)]
unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    // Diagonal mixes (note the rotated v-index patterns below).
    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

// Interleave the 128-bit halves of `a` and `b`: the first returned vector is
// (a.low, b.low), the second is (a.high, b.high) — per the 0x20/0x31
// permute2x128 selectors.
#[inline(always)]
unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
    (
        _mm256_permute2x128_si256(a, b, 0x20),
        _mm256_permute2x128_si256(a, b, 0x31),
    )
}

// There are several ways to do a transposition. We could do it naively, with 8 separate
// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy
// the vecs into contiguous storage and then use gather instructions. This third approach is to use
// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the
// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the
// https://github.com/oconnor663/bao_experiments repo.
//
// In-place 8x8 transpose of 32-bit words across the 8 vectors.
#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77.
    let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
    let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
    let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
    let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
    let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
    let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
    let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
    let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

    // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33.
    let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
    let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
    let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
    let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
    let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
    let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
    let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
    let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);

    // Interleave 128-bit lanes.
    let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04);
    let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15);
    let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26);
    let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37);

    vecs[0] = abcdefgh_0;
    vecs[1] = abcdefgh_1;
    vecs[2] = abcdefgh_2;
    vecs[3] = abcdefgh_3;
    vecs[4] = abcdefgh_4;
    vecs[5] = abcdefgh_5;
    vecs[6] = abcdefgh_6;
    vecs[7] = abcdefgh_7;
}

// Load the block at `block_offset` from each of the 8 inputs (two 32-byte
// halves per input) and transpose into 16 message vectors, one per message
// word. Prefetches 256 bytes ahead on every input for the following blocks.
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)),
    ];
    for i in 0..DEGREE {
        _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
    }
    // Transpose each 8-vector half independently.
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    vecs
}

// Build the (low, high) counter vectors for the 8 lanes. When incrementing,
// lane i gets counter + i; otherwise every lane gets the same counter — the
// all-ones/all-zeros `mask` makes `mask & i` either i or 0 without a branch.
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) {
    let mask = if increment_counter.yes() { !0 } else { 0 };
    (
        set8(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
            counter_low(counter + (mask & 4)),
            counter_low(counter + (mask & 5)),
            counter_low(counter + (mask & 6)),
            counter_low(counter + (mask & 7)),
        ),
        set8(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
            counter_high(counter + (mask & 4)),
            counter_high(counter + (mask & 5)),
            counter_high(counter + (mask & 6)),
            counter_high(counter + (mask & 7)),
        ),
    )
}

/// Hashes 8 equal-length inputs in parallel (full blocks only), writing the
/// 8 chaining values, transposed back to byte order, into `out`.
/// `flags_start` applies to each input's first block, `flags_end` to its last.
#[target_feature(enable = "avx2")]
pub unsafe fn hash8(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            counter_low_vec,
            counter_high_vec,
            block_len_vec,
            block_flags_vec,
        ];
        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        // Feed-forward into the chaining values.
        h_vecs[0] = xor(v[0], v[8]);
        h_vecs[1] = xor(v[1], v[9]);
        h_vecs[2] = xor(v[2], v[10]);
        h_vecs[3] = xor(v[3], v[11]);
        h_vecs[4] = xor(v[4], v[12]);
        h_vecs[5] = xor(v[5], v[13]);
        h_vecs[6] = xor(v[6], v[14]);
        h_vecs[7] = xor(v[7], v[15]);

        block_flags = flags;
    }

    transpose_vecs(&mut h_vecs);
    storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
    storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE));
    storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE));
    storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE));
    storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE));
    storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE));
    storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE));
    storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}

/// Hashes inputs 8 at a time with hash8, then hands any remainder (< 8
/// inputs) off to the SSE4.1 implementation.
#[target_feature(enable = "avx2")]
pub unsafe fn hash_many<const N: usize>(
    mut inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    mut out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
        // Safe because the layout of arrays is guaranteed, and because the
        // `blocks` count is determined statically from the argument type.
        let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
        let blocks = N / BLOCK_LEN;
        hash8(
            input_ptrs,
            blocks,
            key,
            counter,
            increment_counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(out, 0, DEGREE * OUT_LEN),
        );
        if increment_counter.yes() {
            counter += DEGREE as u64;
        }
        inputs = &inputs[DEGREE..];
        out = &mut out[DEGREE * OUT_LEN..];
    }
    crate::sse41::hash_many(
        inputs,
        key,
        counter,
        increment_counter,
        flags,
        flags_start,
        flags_end,
        out,
    );
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_transpose() {
        if !crate::platform::avx2_detected() {
            return;
        }

        #[target_feature(enable = "avx2")]
        unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) {
            transpose_vecs(vecs);
        }

        let mut matrix = [[0 as u32; DEGREE]; DEGREE];
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                matrix[i][j] = (i * DEGREE + j) as u32;
            }
        }

        unsafe {
            let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix);
            transpose_wrapper(&mut vecs);
            matrix = core::mem::transmute(vecs);
        }

        for i in 0..DEGREE {
            for j in 0..DEGREE {
                // Reversed indexes from above.
                assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
            }
        }
    }

    #[test]
    fn test_hash_many() {
        if !crate::platform::avx2_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}
diff --git a/thirdparty/blake3/src/rust_sse2.rs b/thirdparty/blake3/src/rust_sse2.rs
new file mode 100644
index 000000000..bd2be69f6
--- /dev/null
+++ b/thirdparty/blake3/src/rust_sse2.rs
@@ -0,0 +1,775 @@
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
    OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};

// This implementation hashes 4 inputs in parallel, one per 32-bit lane.
pub const DEGREE: usize = 4;

#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm_loadu_si128(src as *const __m128i)
}

#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm_storeu_si128(dest as *mut __m128i, src)
}

// Lane-wise 32-bit wrapping add.
#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
    _mm_add_epi32(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

// Broadcast one u32 to all 4 lanes.
#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
    _mm_set1_epi32(x as i32)
}

// Build a vector from 4 u32s, `a` in the lowest lane.
#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
    _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
+ +#[inline(always)] +unsafe fn rot16(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) +} + +#[inline(always)] +unsafe fn g1( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); +} + +#[inline(always)] +unsafe fn g2( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +macro_rules! shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. 
// See discussion at https://github.com/sneves/blake2-avx2/pull/4
//
// Rotate the rows so the diagonals line up as columns for the diagonal mixes
// (row1 stays put; see the note above).
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}

// Inverse of diagonalize: restores the column layout.
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}

// Per-16-bit-lane blend: picks each lane from `b` where the matching bit of
// `imm8` is set, else from `a`. SSE2-only stand-in for SSE4.1's
// _mm_blend_epi16, built from a compare-generated mask.
#[inline(always)]
unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    let mut mask = _mm_set1_epi16(imm8 as i16);
    mask = _mm_and_si128(mask, bits);
    mask = _mm_cmpeq_epi16(mask, bits);
    _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
}

// Runs all 7 rounds on one block and returns the four state rows *before* the
// final feed-forward. Message permutation is done in-register with shuffles
// and blends rather than via MSG_SCHEDULE lookups. Shared by
// compress_in_place and compress_xof below.
#[inline(always)]
unsafe fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [__m128i; 4] {
    // Rows 0-1: chaining value; row 2: IV constants; row 3: counter,
    // block length, and domain flags.
    let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
    let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
    let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
    let row3 = &mut set4(
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    );

    let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
    let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
    let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
    let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));

    let mut t0;
    let mut t1;
    let mut t2;
    let mut t3;
    let mut tt;

    // Round 1. The first round permutes the message words from the original
    // input order, into the groups that get mixed in parallel.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
    t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
    g1(row0, row1, row2, row3, t2);
    t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
    t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 2. This round and all following rounds apply a fixed permutation
    // to the message words from the round before.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 3
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 4
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 5
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 6
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 7 (final round; no message permutation needed afterwards)
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);

    [*row0, *row1, *row2, *row3]
}

/// Compresses one block and feeds the result forward into `cv`
/// (rows 0/1 xored with rows 2/3, stored back over the chaining value).
#[target_feature(enable = "sse2")]
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
    storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
    storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}

/// Compresses one block and returns the full 64-byte extended output: the low
/// half is the usual feed-forward, the high half is xored with the input cv.
#[target_feature(enable = "sse2")]
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let [mut row0, mut row1, mut row2, mut row3] =
        compress_pre(cv, block, block_len, counter, flags);
    row0 = xor(row0, row2);
    row1 = xor(row1, row3);
    row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
    row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
    core::mem::transmute([row0, row1, row2, row3])
}

/// One compression round over 4 transposed states at once; same structure as
/// the AVX2 version, with MSG_SCHEDULE[r] selecting message vectors.
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    // Diagonal mixes (note the rotated v-index patterns below).
    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

// (head of the SSE2 transpose_vecs; the rest of the function continues past
// this chunk — unchanged here)
#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
    // 22/33. Note that this doesn't split the vector into two lanes, as the
    // AVX2 counterparts do.
+ let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if 
increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
+ storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse2")] +unsafe fn hash1<const N: usize>( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash_many<const N: usize>( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse2_detected() { + return; + } + + #[target_feature(enable = "sse2")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/thirdparty/blake3/src/rust_sse41.rs b/thirdparty/blake3/src/rust_sse41.rs new file mode 100644 index 000000000..1ebadc482 --- /dev/null +++ b/thirdparty/blake3/src/rust_sse41.rs @@ -0,0 +1,766 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. 
+// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) +} + +#[inline(always)] +unsafe fn g1( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); +} + +#[inline(always)] +unsafe fn g2( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +macro_rules! shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. 
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = 
_mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, 
row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, 
mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = 
rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. 
+ let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if 
increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
+ storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn hash1<const N: usize>( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash_many<const N: usize>( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse41_detected() { + return; + } + + #[target_feature(enable = "sse4.1")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/thirdparty/blake3/src/test.rs b/thirdparty/blake3/src/test.rs new file mode 100644 index 000000000..2d21856a0 --- /dev/null +++ b/thirdparty/blake3/src/test.rs @@ -0,0 +1,1049 @@ +use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use arrayref::array_ref; +use arrayvec::ArrayVec; +use core::usize; +use rand::prelude::*; + +// Interesting input lengths to run tests on. +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN - 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 16 * CHUNK_LEN + 1, + 31 * CHUNK_LEN - 1, + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 31 * CHUNK_LEN + 1, + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; + +// There's a test to make sure these two are equal below. +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: CVWords = [ + 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, +]; + +// Paint the input with a repeating byte pattern. 
We use a cycle length of 251, +// because that's the largest prime number less than 256. This makes it +// unlikely that swapping any two adjacent input blocks or chunks will give the +// same answer. +pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +type CompressInPlaceFn = + unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); + +type CompressXofFn = unsafe fn( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64]; + +// A shared helper function for platform-specific tests. +pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { + let initial_state = TEST_KEY_WORDS; + let block_len: u8 = 61; + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len as usize]); + // Use a counter with set bits in both 32-bit words. + let counter = (5u64 << 32) + 6; + let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; + + let portable_out = + crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); + + let mut test_state = initial_state; + unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; + let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); + let test_xof = + unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; + + assert_eq!(&portable_out[..32], &test_state_bytes[..]); + assert_eq!(&portable_out[..], &test_xof[..]); +} + +type HashManyFn<A> = unsafe fn( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +); + +// A shared helper function for platform-specific tests. 
+pub fn test_hash_many_fn( + hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, + hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, +) { + // Test a few different initial counter values. + // - 0: The base case. + // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... + let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + #[cfg(feature = "std")] + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) inputs + const NUM_INPUTS: usize = 31; + let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; + crate::test::paint_test_input(&mut input_buf); + + // First hash chunks. + let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); + } + let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + crate::portable::hash_many( + &chunks, + &TEST_KEY_WORDS, + counter, + IncrementCounter::Yes, + crate::KEYED_HASH, + crate::CHUNK_START, + crate::CHUNK_END, + &mut portable_chunks_out, + ); + + let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_chunks_fn( + &chunks[..], + &TEST_KEY_WORDS, + counter, + IncrementCounter::Yes, + crate::KEYED_HASH, + crate::CHUNK_START, + crate::CHUNK_END, + &mut test_chunks_out, + ); + } + for n in 0..NUM_INPUTS { + #[cfg(feature = "std")] + dbg!(n); + assert_eq!( + &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], + &test_chunks_out[n * OUT_LEN..][..OUT_LEN] + ); + } + + // Then hash parents. 
+ let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); + } + let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; + crate::portable::hash_many( + &parents, + &TEST_KEY_WORDS, + counter, + IncrementCounter::No, + crate::KEYED_HASH | crate::PARENT, + 0, + 0, + &mut portable_parents_out, + ); + + let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_parents_fn( + &parents[..], + &TEST_KEY_WORDS, + counter, + IncrementCounter::No, + crate::KEYED_HASH | crate::PARENT, + 0, + 0, + &mut test_parents_out, + ); + } + for n in 0..NUM_INPUTS { + #[cfg(feature = "std")] + dbg!(n); + assert_eq!( + &portable_parents_out[n * OUT_LEN..][..OUT_LEN], + &test_parents_out[n * OUT_LEN..][..OUT_LEN] + ); + } + } +} + +#[allow(unused)] +type XofManyFunction = unsafe fn( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + out: &mut [u8], +); + +// A shared helper function for platform-specific tests. +#[allow(unused)] +pub fn test_xof_many_fn(xof_many_function: XofManyFunction) { + let mut block = [0; BLOCK_LEN]; + let block_len = 42; + crate::test::paint_test_input(&mut block[..block_len]); + let cv = [40, 41, 42, 43, 44, 45, 46, 47]; + let flags = crate::KEYED_HASH; + + // Test a few different initial counter values. + // - 0: The base case. + // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... 
+ let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + #[cfg(feature = "std")] + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) outputs + const OUTPUT_SIZE: usize = 31 * BLOCK_LEN; + + let mut portable_out = [0u8; OUTPUT_SIZE]; + for (i, out_block) in portable_out.chunks_exact_mut(64).enumerate() { + out_block.copy_from_slice(&crate::portable::compress_xof( + &cv, + &block, + block_len as u8, + counter + i as u64, + flags, + )); + } + + let mut test_out = [0u8; OUTPUT_SIZE]; + unsafe { + xof_many_function(&cv, &block, block_len as u8, counter, flags, &mut test_out); + } + + assert_eq!(portable_out, test_out); + } + + // Test that xof_many doesn't write more blocks than requested. Note that the current assembly + // implementation always outputs at least one block, so we don't test the zero case. + for block_count in 1..=32 { + let mut array = [0; BLOCK_LEN * 33]; + let output_start = 17; + let output_len = block_count * BLOCK_LEN; + let output_end = output_start + output_len; + let output = &mut array[output_start..output_end]; + unsafe { + xof_many_function(&cv, &block, block_len as u8, 0, flags, output); + } + for i in 0..array.len() { + if i < output_start || output_end <= i { + assert_eq!(0, array[i], "index {i}"); + } + } + } +} + +#[test] +fn test_key_bytes_equal_key_words() { + assert_eq!( + TEST_KEY_WORDS, + crate::platform::words_from_le_bytes_32(&TEST_KEY), + ); +} + +#[test] +fn test_reference_impl_size() { + // Because the Rust compiler optimizes struct layout, it's possible that + // some future version of the compiler will produce a different size. If + // that happens, we can either disable this test, or test for multiple + // expected values. For now, the purpose of this test is to make sure we + // notice if that happens. 
+ assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>()); +} + +#[test] +fn test_counter_words() { + let counter: u64 = (1 << 32) + 2; + assert_eq!(crate::counter_low(counter), 2); + assert_eq!(crate::counter_high(counter), 1); +} + +#[test] +fn test_largest_power_of_two_leq() { + let input_output = &[ + // The zero case is nonsensical, but it does work. + (0, 1), + (1, 1), + (2, 2), + (3, 2), + (4, 4), + (5, 4), + (6, 4), + (7, 4), + (8, 8), + // the largest possible usize + (usize::MAX, (usize::MAX >> 1) + 1), + ]; + for &(input, output) in input_output { + assert_eq!( + output, + crate::largest_power_of_two_leq(input), + "wrong output for n={}", + input + ); + } +} + +#[test] +fn test_compare_reference_impl() { + const OUT: usize = 303; // more than 64, not a multiple of 4 + let mut input_buf = [0; TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + for &case in TEST_CASES { + let input = &input_buf[..case]; + #[cfg(feature = "std")] + dbg!(case); + + // regular + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::hash(input); + assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + // incremental + let mut hasher = crate::Hasher::new(); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + // incremental (rayon) + #[cfg(feature = "rayon")] + { + let mut hasher = crate::Hasher::new(); + hasher.update_rayon(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + } + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + + // keyed + { + let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + reference_hasher.update(input); + let mut expected_out = 
[0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::keyed_hash(&TEST_KEY, input); + assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + // incremental + let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + // incremental (rayon) + #[cfg(feature = "rayon")] + { + let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); + hasher.update_rayon(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + } + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + + // derive_key + { + let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::derive_key(context, input); + assert_eq!(test_out, expected_out[..32]); + // incremental + let mut hasher = crate::Hasher::new_derive_key(context); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); + // incremental (rayon) + #[cfg(feature = "rayon")] + { + let mut hasher = crate::Hasher::new_derive_key(context); + hasher.update_rayon(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); + } + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + } +} + +#[test] +fn test_compare_reference_impl_long_xof() { + let mut reference_output = [0u8; 32 * BLOCK_LEN - 1]; + let mut reference_hasher = 
reference_impl::Hasher::new_keyed(&TEST_KEY); + reference_hasher.update(b"hello world"); + reference_hasher.finalize(&mut reference_output); + + let mut test_output = [0u8; 32 * BLOCK_LEN - 1]; + let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); + test_hasher.update(b"hello world"); + test_hasher.finalize_xof().fill(&mut test_output); + + assert_eq!(reference_output, test_output); +} + +#[test] +fn test_xof_partial_blocks() { + const OUT_LEN: usize = 6 * BLOCK_LEN; + let mut reference_out = [0u8; OUT_LEN]; + reference_impl::Hasher::new().finalize(&mut reference_out); + + let mut all_at_once_out = [0u8; OUT_LEN]; + crate::Hasher::new() + .finalize_xof() + .fill(&mut all_at_once_out); + assert_eq!(reference_out, all_at_once_out); + + let mut partial_out = [0u8; OUT_LEN]; + let partial_start = 32; + let partial_end = OUT_LEN - 32; + let mut xof = crate::Hasher::new().finalize_xof(); + xof.fill(&mut partial_out[..partial_start]); + xof.fill(&mut partial_out[partial_start..partial_end]); + xof.fill(&mut partial_out[partial_end..]); + assert_eq!(reference_out, partial_out); +} + +fn reference_hash(input: &[u8]) -> crate::Hash { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + let mut bytes = [0; 32]; + hasher.finalize(&mut bytes); + bytes.into() +} + +#[test] +fn test_compare_update_multiple() { + // Don't use all the long test cases here, since that's unnecessarily slow + // in debug mode. 
+ let mut short_test_cases = TEST_CASES; + while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { + short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; + } + assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); + + let mut input_buf = [0; 2 * TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + + for &first_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(first_update); + let first_input = &input_buf[..first_update]; + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(first_input); + + for &second_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(second_update); + let second_input = &input_buf[first_update..][..second_update]; + let total_input = &input_buf[..first_update + second_update]; + + // Clone the hasher with first_update bytes already written, so + // that the next iteration can reuse it. + let mut test_hasher = test_hasher.clone(); + test_hasher.update(second_input); + let expected = reference_hash(total_input); + assert_eq!(expected, test_hasher.finalize()); + } + } +} + +#[test] +fn test_fuzz_hasher() { + const INPUT_MAX: usize = 4 * CHUNK_LEN; + let mut input_buf = [0; 3 * INPUT_MAX]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; + + // Use a fixed RNG seed for reproducibility. + let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + #[cfg(feature = "std")] + dbg!(_num_test); + let mut hasher = crate::Hasher::new(); + let mut total_input = 0; + // For each test, write 3 inputs of random length. 
+ for _ in 0..3 { + let input_len = rng.random_range(0..(INPUT_MAX + 1)); + #[cfg(feature = "std")] + dbg!(input_len); + let input = &input_buf[total_input..][..input_len]; + hasher.update(input); + total_input += input_len; + } + let expected = reference_hash(&input_buf[..total_input]); + assert_eq!(expected, hasher.finalize()); + } +} + +#[test] +fn test_fuzz_xof() { + let mut input_buf = [0u8; 3 * BLOCK_LEN]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 2500 }; + + // Use a fixed RNG seed for reproducibility. + let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + #[cfg(feature = "std")] + dbg!(_num_test); + // 31 (16 + 8 + 4 + 2 + 1) outputs + let mut output_buf = [0; 31 * CHUNK_LEN]; + let input_len = rng.random_range(0..input_buf.len()); + let mut xof = crate::Hasher::new() + .update(&input_buf[..input_len]) + .finalize_xof(); + let partial_start = rng.random_range(0..output_buf.len()); + let partial_end = rng.random_range(partial_start..output_buf.len()); + xof.fill(&mut output_buf[..partial_start]); + xof.fill(&mut output_buf[partial_start..partial_end]); + xof.fill(&mut output_buf[partial_end..]); + + let mut reference_buf = [0; 31 * CHUNK_LEN]; + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(&input_buf[..input_len]); + reference_hasher.finalize(&mut reference_buf); + + assert_eq!(reference_buf, output_buf); + } +} + +#[test] +fn test_xof_seek() { + let mut out = [0; 533]; + let mut hasher = crate::Hasher::new(); + hasher.update(b"foo"); + hasher.finalize_xof().fill(&mut out); + assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); + + let mut reader = hasher.finalize_xof(); + reader.set_position(303); + 
let mut out2 = [0; 102]; + reader.fill(&mut out2); + assert_eq!(&out[303..][..102], &out2[..]); + + #[cfg(feature = "std")] + { + use std::io::prelude::*; + let mut reader = hasher.finalize_xof(); + reader.seek(std::io::SeekFrom::Start(303)).unwrap(); + let mut out3 = Vec::new(); + reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); + assert_eq!(&out[303..][..102], &out3[..]); + + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 + ); + reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 - 5 + ); + let mut out4 = [0; 17]; + assert_eq!(reader.read(&mut out4).unwrap(), 17); + assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 - 5 + 17 + ); + assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); + assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); + } +} + +#[test] +fn test_msg_schedule_permutation() { + let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + + let mut generated = [[0; 16]; 7]; + generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + + for round in 1..7 { + for i in 0..16 { + generated[round][i] = generated[round - 1][permutation[i]]; + } + } + + assert_eq!(generated, crate::MSG_SCHEDULE); +} + +#[test] +fn test_reset() { + let mut hasher = crate::Hasher::new(); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); + + let key = &[99; crate::KEY_LEN]; + let mut keyed_hasher = crate::Hasher::new_keyed(key); + keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); + keyed_hasher.reset(); + keyed_hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!( + keyed_hasher.finalize(), + crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), + ); + + let context = "BLAKE3 2020-02-12 10:20:58 reset test"; + let mut kdf 
= crate::Hasher::new_derive_key(context); + kdf.update(&[42; 3 * CHUNK_LEN + 7]); + kdf.reset(); + kdf.update(&[42; CHUNK_LEN + 3]); + let expected = crate::derive_key(context, &[42; CHUNK_LEN + 3]); + assert_eq!(kdf.finalize(), expected); +} + +#[test] +fn test_hex_encoding_decoding() { + let digest_str = "04e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; + let mut hasher = crate::Hasher::new(); + hasher.update(b"foo"); + let digest = hasher.finalize(); + assert_eq!(digest.to_hex().as_str(), digest_str); + #[cfg(feature = "std")] + assert_eq!(digest.to_string(), digest_str); + + // Test round trip + let digest = crate::Hash::from_hex(digest_str).unwrap(); + assert_eq!(digest.to_hex().as_str(), digest_str); + + // Test uppercase + let digest = crate::Hash::from_hex(digest_str.to_uppercase()).unwrap(); + assert_eq!(digest.to_hex().as_str(), digest_str); + + // Test string parsing via FromStr + let digest: crate::Hash = digest_str.parse().unwrap(); + assert_eq!(digest.to_hex().as_str(), digest_str); + + // Test errors + let bad_len = "04e0bb39f30b1"; + let _result = crate::Hash::from_hex(bad_len).unwrap_err(); + #[cfg(feature = "std")] + assert_eq!(_result.to_string(), "expected 64 hex bytes, received 13"); + + let bad_char = "Z4e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; + let _result = crate::Hash::from_hex(bad_char).unwrap_err(); + #[cfg(feature = "std")] + assert_eq!(_result.to_string(), "invalid hex character: 'Z'"); + + let _result = crate::Hash::from_hex([128; 64]).unwrap_err(); + #[cfg(feature = "std")] + assert_eq!(_result.to_string(), "invalid hex character: 0x80"); +} + +// This test is a minimized failure case for the Windows SSE2 bug described in +// https://github.com/BLAKE3-team/BLAKE3/issues/206. 
+// +// Before that issue was fixed, this test would fail on Windows in the following configuration: +// +// cargo test --features=no_avx512,no_avx2,no_sse41 --release +// +// Bugs like this one (stomping on a caller's register) are very sensitive to the details of +// surrounding code, so it's not especially likely that this test will catch another bug (or even +// the same bug) in the future. Still, there's no harm in keeping it. +#[test] +fn test_issue_206_windows_sse2() { + // This stupid loop has to be here to trigger the bug. I don't know why. + for _ in &[0] { + // The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also + // doesn't repro with an all-zero input. + let input = &[0xff; 65]; + let expected_hash = [ + 183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179, + 245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161, + ]; + + // This throwaway call has to be here to trigger the bug. + crate::Hasher::new().update(input); + + // This assert fails when the bug is triggered. 
+ assert_eq!(crate::Hasher::new().update(input).finalize(), expected_hash); + } +} + +#[test] +fn test_hash_conversions() { + let bytes1 = [42; 32]; + let hash1: crate::Hash = bytes1.into(); + let bytes2: [u8; 32] = hash1.into(); + assert_eq!(bytes1, bytes2); + + let bytes3 = *hash1.as_bytes(); + assert_eq!(bytes1, bytes3); + + let hash2 = crate::Hash::from_bytes(bytes1); + assert_eq!(hash1, hash2); + + let hex = hash1.to_hex(); + let hash3 = crate::Hash::from_hex(hex.as_bytes()).unwrap(); + assert_eq!(hash1, hash3); + + let slice1: &[u8] = bytes1.as_slice(); + let hash4 = crate::Hash::from_slice(slice1).expect("correct length"); + assert_eq!(hash1, hash4); + + assert!(crate::Hash::from_slice(&[]).is_err()); + assert!(crate::Hash::from_slice(&[42]).is_err()); + assert!(crate::Hash::from_slice([42; 31].as_slice()).is_err()); + assert!(crate::Hash::from_slice([42; 33].as_slice()).is_err()); + assert!(crate::Hash::from_slice([42; 100].as_slice()).is_err()); +} + +#[test] +const fn test_hash_const_conversions() { + let bytes = [42; 32]; + let hash = crate::Hash::from_bytes(bytes); + _ = hash.as_bytes(); +} + +#[cfg(feature = "zeroize")] +#[test] +fn test_zeroize() { + use zeroize::Zeroize; + + let mut hash = crate::Hash([42; 32]); + hash.zeroize(); + assert_eq!(hash.0, [0u8; 32]); + + let mut hasher = crate::Hasher { + chunk_state: crate::ChunkState { + cv: [42; 8], + chunk_counter: 42, + buf: [42; 64], + buf_len: 42, + blocks_compressed: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + initial_chunk_counter: 42, + key: [42; 8], + cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), + }; + hasher.zeroize(); + assert_eq!(hasher.chunk_state.cv, [0; 8]); + assert_eq!(hasher.chunk_state.chunk_counter, 0); + assert_eq!(hasher.chunk_state.buf, [0; 64]); + assert_eq!(hasher.chunk_state.buf_len, 0); + assert_eq!(hasher.chunk_state.blocks_compressed, 0); + assert_eq!(hasher.chunk_state.flags, 0); + assert!(matches!( + hasher.chunk_state.platform, + 
crate::Platform::Portable + )); + assert_eq!(hasher.initial_chunk_counter, 0); + assert_eq!(hasher.key, [0; 8]); + assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); + + let mut output_reader = crate::OutputReader { + inner: crate::Output { + input_chaining_value: [42; 8], + block: [42; 64], + counter: 42, + block_len: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + position_within_block: 42, + }; + + output_reader.zeroize(); + assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); + assert_eq!(output_reader.inner.block, [0; 64]); + assert_eq!(output_reader.inner.counter, 0); + assert_eq!(output_reader.inner.block_len, 0); + assert_eq!(output_reader.inner.flags, 0); + assert!(matches!( + output_reader.inner.platform, + crate::Platform::Portable + )); + assert_eq!(output_reader.position_within_block, 0); +} + +#[test] +#[cfg(feature = "std")] +fn test_update_reader() -> Result<(), std::io::Error> { + // This is a brief test, since update_reader() is mostly a wrapper around update(), which already + // has substantial testing. 
+ let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + assert_eq!( + crate::Hasher::new().update_reader(&input[..])?.finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +fn test_update_reader_interrupted() -> std::io::Result<()> { + use std::io; + struct InterruptingReader<'a> { + already_interrupted: bool, + slice: &'a [u8], + } + impl<'a> InterruptingReader<'a> { + fn new(slice: &'a [u8]) -> Self { + Self { + already_interrupted: false, + slice, + } + } + } + impl<'a> io::Read for InterruptingReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + if !self.already_interrupted { + self.already_interrupted = true; + return Err(io::Error::from(io::ErrorKind::Interrupted)); + } + let take = std::cmp::min(self.slice.len(), buf.len()); + buf[..take].copy_from_slice(&self.slice[..take]); + self.slice = &self.slice[take..]; + Ok(take) + } + } + + let input = b"hello world"; + let mut reader = InterruptingReader::new(input); + let mut hasher = crate::Hasher::new(); + hasher.update_reader(&mut reader)?; + assert_eq!(hasher.finalize(), crate::hash(input)); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already + // has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap(tempfile.path())? + .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(target_os = "linux")] +fn test_mmap_virtual_file() -> Result<(), std::io::Error> { + // Virtual files like /proc/version can't be mmapped, because their contents don't actually + // exist anywhere in memory. 
Make sure we fall back to regular file IO in these cases. + // Currently this is handled with a length check, where the assumption is that virtual files + // will always report length 0. If that assumption ever breaks, hopefully this test will catch + // it. + let virtual_filepath = "/proc/version"; + let mut mmap_hasher = crate::Hasher::new(); + // We'll fail right here if the fallback doesn't work. + mmap_hasher.update_mmap(virtual_filepath)?; + let mut read_hasher = crate::Hasher::new(); + read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; + assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(feature = "rayon")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap_rayon() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), + // which already has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap_rayon(tempfile.path())? + .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +#[cfg(feature = "serde")] +fn test_serde() { + // Henrik suggested that we use 0xfe / 254 for byte test data instead of 0xff / 255, due to the + // fact that 0xfe is not a well formed CBOR item. 
+ let hash: crate::Hash = [0xfe; 32].into(); + + let json = serde_json::to_string(&hash).unwrap(); + assert_eq!( + json, + "[254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]", + ); + let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); + assert_eq!(hash, hash2); + + let mut cbor = Vec::<u8>::new(); + ciborium::into_writer(&hash, &mut cbor).unwrap(); + assert_eq!( + cbor, + [ + 0x98, 0x20, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + ] + ); + let hash_from_cbor: crate::Hash = ciborium::from_reader(&cbor[..]).unwrap(); + assert_eq!(hash_from_cbor, hash); + + // Version 1.5.2 of this crate changed the default serialization format to a bytestring + // (instead of an array/list) to save bytes on the wire. That was a backwards compatibility + // mistake for non-self-describing formats, and it's been reverted. Since some small number of + // serialized bytestrings will probably exist forever in the wild, we should test that we can + // still deserialize these from self-describing formats. + let bytestring_cbor: &[u8] = &[ + 0x58, 0x20, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfe, 0xfe, 0xfe, 0xfe, + ]; + let hash_from_bytestring_cbor: crate::Hash = ciborium::from_reader(bytestring_cbor).unwrap(); + assert_eq!(hash_from_bytestring_cbor, hash); +} + +// `cargo +nightly miri test` currently works, but it takes forever, because some of our test +// inputs are quite large. 
Most of our unsafe code is platform specific and incompatible with Miri +// anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming +// they don't use incompatible features like Rayon or mmap. This test should get reasonable +// coverage of our public API without using any large inputs, so we can run it in CI and catch +// obvious breaks. (For example, constant_time_eq is not compatible with Miri.) +#[test] +fn test_miri_smoketest() { + let mut hasher = crate::Hasher::new_derive_key("Miri smoketest"); + hasher.update(b"foo"); + #[cfg(feature = "std")] + hasher.update_reader(&b"bar"[..]).unwrap(); + assert_eq!(hasher.finalize(), hasher.finalize()); + let mut reader = hasher.finalize_xof(); + reader.set_position(999999); + reader.fill(&mut [0]); +} + +// I had to move these tests out of the deprecated guts module, because leaving them there causes +// an un-silenceable warning: https://github.com/rust-lang/rust/issues/47238 +#[cfg(test)] +#[allow(deprecated)] +mod guts_tests { + use crate::guts::*; + + #[test] + fn test_chunk() { + assert_eq!( + crate::hash(b"foo"), + ChunkState::new(0).update(b"foo").finalize(true) + ); + } + + #[test] + fn test_parents() { + let mut hasher = crate::Hasher::new(); + let mut buf = [0; crate::CHUNK_LEN]; + + buf[0] = 'a' as u8; + hasher.update(&buf); + let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); + + buf[0] = 'b' as u8; + hasher.update(&buf); + let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); + + hasher.update(b"c"); + let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); + + let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); + let root = parent_cv(&parent, &chunk2_cv, true); + assert_eq!(hasher.finalize(), root); + } +} diff --git a/thirdparty/blake3/src/traits.rs b/thirdparty/blake3/src/traits.rs new file mode 100644 index 000000000..70b1c0687 --- /dev/null +++ b/thirdparty/blake3/src/traits.rs @@ -0,0 +1,227 @@ +//! 
Implementations of commonly used traits like `Digest` and `Mac` from the +//! [`digest`](https://crates.io/crates/digest) crate. + +pub use digest; + +use crate::{Hasher, OutputReader}; +use digest::crypto_common; +use digest::generic_array::{typenum::U32, typenum::U64, GenericArray}; + +impl digest::HashMarker for Hasher {} + +impl digest::Update for Hasher { + #[inline] + fn update(&mut self, data: &[u8]) { + self.update(data); + } +} + +impl digest::Reset for Hasher { + #[inline] + fn reset(&mut self) { + self.reset(); // the inherent method + } +} + +impl digest::OutputSizeUser for Hasher { + type OutputSize = U32; +} + +impl digest::FixedOutput for Hasher { + #[inline] + fn finalize_into(self, out: &mut GenericArray<u8, Self::OutputSize>) { + out.copy_from_slice(self.finalize().as_bytes()); + } +} + +impl digest::FixedOutputReset for Hasher { + #[inline] + fn finalize_into_reset(&mut self, out: &mut GenericArray<u8, Self::OutputSize>) { + out.copy_from_slice(self.finalize().as_bytes()); + self.reset(); + } +} + +impl digest::ExtendableOutput for Hasher { + type Reader = OutputReader; + + #[inline] + fn finalize_xof(self) -> Self::Reader { + Hasher::finalize_xof(&self) + } +} + +impl digest::ExtendableOutputReset for Hasher { + #[inline] + fn finalize_xof_reset(&mut self) -> Self::Reader { + let reader = Hasher::finalize_xof(self); + self.reset(); + reader + } +} + +impl digest::XofReader for OutputReader { + #[inline] + fn read(&mut self, buffer: &mut [u8]) { + self.fill(buffer); + } +} + +impl crypto_common::KeySizeUser for Hasher { + type KeySize = U32; +} + +impl crypto_common::BlockSizeUser for Hasher { + type BlockSize = U64; +} + +impl digest::MacMarker for Hasher {} + +impl digest::KeyInit for Hasher { + #[inline] + fn new(key: &digest::Key<Self>) -> Self { + let key_bytes: [u8; 32] = (*key).into(); + Hasher::new_keyed(&key_bytes) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_digest_traits() { + // Inherent methods. 
+ let mut hasher1 = crate::Hasher::new(); + hasher1.update(b"foo"); + hasher1.update(b"bar"); + hasher1.update(b"baz"); + let out1 = hasher1.finalize(); + let mut xof1 = [0; 301]; + hasher1.finalize_xof().fill(&mut xof1); + assert_eq!(out1.as_bytes(), &xof1[..32]); + + // Trait implementations. + let mut hasher2: crate::Hasher = digest::Digest::new(); + digest::Digest::update(&mut hasher2, b"xxx"); + digest::Digest::reset(&mut hasher2); + digest::Digest::update(&mut hasher2, b"foo"); + digest::Digest::update(&mut hasher2, b"bar"); + digest::Digest::update(&mut hasher2, b"baz"); + let out2 = digest::Digest::finalize(hasher2.clone()); + let mut xof2 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), + &mut xof2, + ); + assert_eq!(out1.as_bytes(), &out2[..]); + assert_eq!(xof1[..], xof2[..]); + + // Again with the resetting variants. + let mut hasher3: crate::Hasher = digest::Digest::new(); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut out3 = [0; 32]; + digest::FixedOutputReset::finalize_into_reset( + &mut hasher3, + GenericArray::from_mut_slice(&mut out3), + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut out4 = [0; 32]; + digest::FixedOutputReset::finalize_into_reset( + &mut hasher3, + GenericArray::from_mut_slice(&mut out4), + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut xof3 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), + &mut xof3, + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut xof4 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), + &mut xof4, + ); + assert_eq!(out1.as_bytes(), &out3[..]); + assert_eq!(out1.as_bytes(), &out4[..]); + assert_eq!(xof1[..], xof3[..]); + assert_eq!(xof1[..], xof4[..]); + } + + #[test] + fn test_mac_trait() { + // Inherent methods. 
        let key = b"some super secret key bytes fooo";
        let mut hasher1 = crate::Hasher::new_keyed(key);
        hasher1.update(b"foo");
        hasher1.update(b"bar");
        hasher1.update(b"baz");
        let out1 = hasher1.finalize();

        // Trait implementation.
        let generic_key = (*key).into();
        let mut hasher2: crate::Hasher = digest::Mac::new(&generic_key);
        // Update with throwaway data, then reset, to show reset() restores the keyed state.
        digest::Mac::update(&mut hasher2, b"xxx");
        digest::Mac::reset(&mut hasher2);
        digest::Mac::update(&mut hasher2, b"foo");
        digest::Mac::update(&mut hasher2, b"bar");
        digest::Mac::update(&mut hasher2, b"baz");
        let out2 = digest::Mac::finalize(hasher2);
        assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice());
    }

    // Reference implementation of HMAC instantiated with BLAKE3, per the standard RFC 2104
    // construction with a 64-byte block size: H((K' ^ opad) || H((K' ^ ipad) || input)),
    // where K' is the key itself when it fits in one block, else its hash.
    // See https://en.wikipedia.org/wiki/HMAC.
    fn expected_hmac_blake3(key: &[u8], input: &[u8]) -> [u8; 32] {
        // See https://en.wikipedia.org/wiki/HMAC.
        let key_hash;
        let key_prime = if key.len() <= 64 {
            key
        } else {
            // Long keys are first reduced to a single 32-byte hash.
            key_hash = *crate::hash(key).as_bytes();
            &key_hash
        };
        // Standard HMAC inner/outer pad bytes.
        let mut ipad = [0x36; 64];
        let mut opad = [0x5c; 64];
        for i in 0..key_prime.len() {
            ipad[i] ^= key_prime[i];
            opad[i] ^= key_prime[i];
        }
        let mut inner_state = crate::Hasher::new();
        inner_state.update(&ipad);
        inner_state.update(input);
        let mut outer_state = crate::Hasher::new();
        outer_state.update(&opad);
        outer_state.update(inner_state.finalize().as_bytes());
        outer_state.finalize().into()
    }

    #[test]
    fn test_hmac_compatibility() {
        use hmac::{Mac, SimpleHmac};

        // Test a short key.
        let mut x = SimpleHmac::<Hasher>::new_from_slice(b"key").unwrap();
        hmac::digest::Update::update(&mut x, b"data");
        let output = x.finalize().into_bytes();
        assert_ne!(output.len(), 0);
        let expected = expected_hmac_blake3(b"key", b"data");
        assert_eq!(expected, output.as_ref());

        // Test a range of key and data lengths, particularly to exercise the long-key logic.
+ let mut input_bytes = [0; crate::test::TEST_CASES_MAX]; + crate::test::paint_test_input(&mut input_bytes); + for &input_len in crate::test::TEST_CASES { + #[cfg(feature = "std")] + dbg!(input_len); + let input = &input_bytes[..input_len]; + + let mut x = SimpleHmac::<Hasher>::new_from_slice(input).unwrap(); + hmac::digest::Update::update(&mut x, input); + let output = x.finalize().into_bytes(); + assert_ne!(output.len(), 0); + + let expected = expected_hmac_blake3(input, input); + assert_eq!(expected, output.as_ref()); + } + } +} diff --git a/thirdparty/blake3/src/wasm32_simd.rs b/thirdparty/blake3/src/wasm32_simd.rs new file mode 100644 index 000000000..135249158 --- /dev/null +++ b/thirdparty/blake3/src/wasm32_simd.rs @@ -0,0 +1,794 @@ +/* + * This code is based on rust_sse2.rs of the same distribution, and is subject to further improvements. + * Some comments are left intact even if their applicability is questioned. + * + * Performance measurements with a primitive benchmark with ~16Kb of data: + * + * | M1 native | 11,610 ns | + * | M1 Wasm SIMD | 13,355 ns | + * | M1 Wasm | 22,037 ns | + * | x64 native | 6,713 ns | + * | x64 Wasm SIMD | 11,985 ns | + * | x64 Wasm | 25,978 ns | + * + * wasmtime v12.0.1 was used on both platforms. + */ + +use core::arch::wasm32::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> v128 { + // This is an unaligned load, so the pointer cast is allowed. + v128_load(src as *const v128) +} + +#[inline(always)] +unsafe fn storeu(src: v128, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. 
    v128_store(dest as *mut v128, src)
}

// Lane-wise wrapping 32-bit addition.
#[inline(always)]
fn add(a: v128, b: v128) -> v128 {
    i32x4_add(a, b)
}

#[inline(always)]
fn xor(a: v128, b: v128) -> v128 {
    v128_xor(a, b)
}

// Broadcast one 32-bit word to all four lanes.
#[inline(always)]
fn set1(x: u32) -> v128 {
    i32x4_splat(x as i32)
}

#[inline(always)]
fn set4(a: u32, b: u32, c: u32, d: u32) -> v128 {
    i32x4(a as i32, b as i32, c as i32, d as i32)
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
#[inline(always)]
fn rot16(a: v128) -> v128 {
    v128_or(u32x4_shr(a, 16), u32x4_shl(a, 32 - 16))
}

#[inline(always)]
fn rot12(a: v128) -> v128 {
    v128_or(u32x4_shr(a, 12), u32x4_shl(a, 32 - 12))
}

#[inline(always)]
fn rot8(a: v128) -> v128 {
    v128_or(u32x4_shr(a, 8), u32x4_shl(a, 32 - 8))
}

#[inline(always)]
fn rot7(a: v128) -> v128 {
    v128_or(u32x4_shr(a, 7), u32x4_shl(a, 32 - 7))
}

// First half of the mixing function G, applied to all four columns (or
// diagonals) at once: add message word, then the 16- and 12-bit rotations.
#[inline(always)]
fn g1(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot16(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot12(*row1);
}

// Second half of G: add the other message word, then the 8- and 7-bit rotations.
#[inline(always)]
fn g2(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot8(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot7(*row1);
}

// It could be a function, but arithmetic in const generics is too limited yet.
macro_rules!
shuffle {
    ($a: expr, $b: expr, $z:expr, $y:expr, $x:expr, $w:expr) => {
        // Emulates x86 _mm_shuffle_ps-style selection: low two lanes from $a,
        // high two lanes from $b (hence the +4 lane indices).
        i32x4_shuffle::<{ $w }, { $x }, { $y + 4 }, { $z + 4 }>($a, $b)
    };
}

#[inline(always)]
fn unpacklo_epi64(a: v128, b: v128) -> v128 {
    i64x2_shuffle::<0, 2>(a, b)
}

#[inline(always)]
fn unpackhi_epi64(a: v128, b: v128) -> v128 {
    i64x2_shuffle::<1, 3>(a, b)
}

#[inline(always)]
fn unpacklo_epi32(a: v128, b: v128) -> v128 {
    i32x4_shuffle::<0, 4, 1, 5>(a, b)
}

#[inline(always)]
fn unpackhi_epi32(a: v128, b: v128) -> v128 {
    i32x4_shuffle::<2, 6, 3, 7>(a, b)
}

#[inline(always)]
fn shuffle_epi32<const I3: usize, const I2: usize, const I1: usize, const I0: usize>(
    a: v128,
) -> v128 {
    // Please note that generic arguments in declaration and implementation are in
    // different order.
    // second arg is actually ignored.
    i32x4_shuffle::<I0, I1, I2, I3>(a, a)
}

// Emulates x86 _mm_blend_epi16: for each 16-bit lane i, pick from `b` when bit i
// of imm8 is set, else from `a`.
#[inline(always)]
fn blend_epi16(a: v128, b: v128, imm8: i32) -> v128 {
    // imm8 is always constant; it allows to implement this function with
    // i16x8_shuffle. However, it is marginally slower on x64.
    let bits = i16x8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
    let mut mask = i16x8_splat(imm8 as i16);
    mask = v128_and(mask, bits);
    mask = i16x8_eq(mask, bits);
    // The swapped argument order is equivalent to mask negation.
    v128_bitselect(b, a, mask)
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this.
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +fn diagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) { + *row0 = shuffle_epi32::<2, 1, 0, 3>(*row0); + *row3 = shuffle_epi32::<1, 0, 3, 2>(*row3); + *row2 = shuffle_epi32::<0, 3, 2, 1>(*row2); +} + +#[inline(always)] +fn undiagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) { + *row0 = shuffle_epi32::<0, 3, 2, 1>(*row0); + *row3 = shuffle_epi32::<1, 0, 3, 2>(*row3); + *row2 = shuffle_epi32::<2, 1, 0, 3>(*row2); +} + +#[inline(always)] +fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [v128; 4] { + // safe because CVWords is [u32; 8] + let row0 = &mut unsafe { loadu(cv.as_ptr().add(0) as *const u8) }; + let row1 = &mut unsafe { loadu(cv.as_ptr().add(4) as *const u8) }; + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + // safe because block is &[u8; 64] + let mut m0 = unsafe { loadu(block.as_ptr().add(0 * 4 * DEGREE)) }; + let mut m1 = unsafe { loadu(block.as_ptr().add(1 * 4 * DEGREE)) }; + let mut m2 = unsafe { loadu(block.as_ptr().add(2 * 4 * DEGREE)) }; + let mut m3 = unsafe { loadu(block.as_ptr().add(3 * 4 * DEGREE)) }; + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = shuffle!(m0, m1, 2, 0, 2, 0); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle!(m0, m1, 3, 1, 3, 1); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle!(m2, m3, 2, 0, 2, 0); // 14 12 10 8 + t2 = shuffle_epi32::<2, 1, 0, 3>(t2); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle!(m2, m3, 3, 1, 3, 1); // 15 13 11 9 + t3 = shuffle_epi32::<2, 1, 0, 3>(t3); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = shuffle!(m0, m1, 3, 1, 1, 2); + t0 = shuffle_epi32::<0, 3, 2, 1>(t0); + g1(row0, row1, row2, row3, t0); + t1 = shuffle!(m2, m3, 3, 3, 2, 2); + tt = shuffle_epi32::<0, 0, 3, 3>(m0); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = shuffle_epi32::<1, 3, 2, 0>(tt); + g1(row0, row1, row2, row3, t2); + t3 = unpackhi_epi32(m1, m3); + tt = unpacklo_epi32(m2, t3); + t3 = shuffle_epi32::<0, 1, 3, 2>(tt); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle!(m0, m1, 3, 1, 1, 2); + t0 = shuffle_epi32::<0, 3, 2, 1>(t0); + g1(row0, row1, row2, row3, t0); + t1 = shuffle!(m2, m3, 3, 3, 2, 2); + tt = shuffle_epi32::<0, 0, 3, 3>(m0); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = shuffle_epi32::<1, 3, 2, 0>(tt); + g1(row0, row1, row2, row3, t2); + t3 = unpackhi_epi32(m1, m3); + tt = unpacklo_epi32(m2, t3); + t3 = shuffle_epi32::<0, 1, 3, 2>(tt); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + 
t0 = shuffle!(m0, m1, 3, 1, 1, 2); + t0 = shuffle_epi32::<0, 3, 2, 1>(t0); + g1(row0, row1, row2, row3, t0); + t1 = shuffle!(m2, m3, 3, 3, 2, 2); + tt = shuffle_epi32::<0, 0, 3, 3>(m0); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = shuffle_epi32::<1, 3, 2, 0>(tt); + g1(row0, row1, row2, row3, t2); + t3 = unpackhi_epi32(m1, m3); + tt = unpacklo_epi32(m2, t3); + t3 = shuffle_epi32::<0, 1, 3, 2>(tt); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle!(m0, m1, 3, 1, 1, 2); + t0 = shuffle_epi32::<0, 3, 2, 1>(t0); + g1(row0, row1, row2, row3, t0); + t1 = shuffle!(m2, m3, 3, 3, 2, 2); + tt = shuffle_epi32::<0, 0, 3, 3>(m0); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = shuffle_epi32::<1, 3, 2, 0>(tt); + g1(row0, row1, row2, row3, t2); + t3 = unpackhi_epi32(m1, m3); + tt = unpacklo_epi32(m2, t3); + t3 = shuffle_epi32::<0, 1, 3, 2>(tt); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle!(m0, m1, 3, 1, 1, 2); + t0 = shuffle_epi32::<0, 3, 2, 1>(t0); + g1(row0, row1, row2, row3, t0); + t1 = shuffle!(m2, m3, 3, 3, 2, 2); + tt = shuffle_epi32::<0, 0, 3, 3>(m0); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = shuffle_epi32::<1, 3, 2, 0>(tt); + g1(row0, row1, row2, row3, t2); + t3 = unpackhi_epi32(m1, m3); + tt = unpacklo_epi32(m2, t3); + t3 = shuffle_epi32::<0, 1, 3, 2>(tt); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle!(m0, m1, 3, 
1, 1, 2); + t0 = shuffle_epi32::<0, 3, 2, 1>(t0); + g1(row0, row1, row2, row3, t0); + t1 = shuffle!(m2, m3, 3, 3, 2, 2); + tt = shuffle_epi32::<0, 0, 3, 3>(m0); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = shuffle_epi32::<1, 3, 2, 0>(tt); + g1(row0, row1, row2, row3, t2); + t3 = unpackhi_epi32(m1, m3); + tt = unpacklo_epi32(m2, t3); + t3 = shuffle_epi32::<0, 1, 3, 2>(tt); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "simd128")] +pub fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + // it stores in reversed order... + // safe because CVWords is [u32; 8] + unsafe { + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); + } +} + +#[target_feature(enable = "simd128")] +pub fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + // safe because CVWords is [u32; 8] + row2 = xor(row2, unsafe { loadu(cv.as_ptr().add(0) as *const u8) }); + row3 = xor(row3, unsafe { loadu(cv.as_ptr().add(4) as *const u8) }); + // It seems to be architecture dependent, but works. + // safe because sizes match, and every state of u8 is valid. 
+ unsafe { core::mem::transmute([row0, row1, row2, row3]) } +} + +#[inline(always)] +fn round(v: &mut [v128; 16], m: &[v128; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); 
+ v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +fn transpose_vecs(vecs: &mut [v128; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + let ab_01 = unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
    let abcd_0 = unpacklo_epi64(ab_01, cd_01);
    let abcd_1 = unpackhi_epi64(ab_01, cd_01);
    let abcd_2 = unpacklo_epi64(ab_23, cd_23);
    let abcd_3 = unpackhi_epi64(ab_23, cd_23);

    vecs[0] = abcd_0;
    vecs[1] = abcd_1;
    vecs[2] = abcd_2;
    vecs[3] = abcd_3;
}

// Load one block (BLOCK_LEN bytes) from each of the four inputs at
// `block_offset`, then transpose each 4x4 square of vectors so that the
// returned array holds the 16 message words lane-parallel across inputs.
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [v128; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}

// Build the low/high halves of the per-lane 64-bit block counters. When
// `increment_counter` is yes, lane i gets counter + i; otherwise every lane
// shares the same counter.
#[inline(always)]
fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (v128, v128) {
    // `mask` is all-ones or all-zeros, so `mask & i` is either i or 0.
    let mask = if increment_counter.yes() { !0 } else { 0 };
    (
        set4(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
        ),
        set4(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
        ),
    )
}
+#[target_feature(enable = "simd128")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
+ storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "simd128")] +unsafe fn hash1<const N: usize>( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); +} + +#[target_feature(enable = "simd128")] +pub unsafe fn hash_many<const N: usize>( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + #[target_feature(enable = "simd128")] + fn transpose_wrapper(vecs: &mut [v128; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [v128; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. + assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} |