diff options
| author | Stefan Boberg <[email protected]> | 2022-09-20 17:28:41 +0200 |
|---|---|---|
| committer | GitHub <[email protected]> | 2022-09-20 17:28:41 +0200 |
| commit | a735967c7c54fcecbfd9760286afc06a3b48233a (patch) | |
| tree | 4789717b7a05c7122cb366d3bcf5810db9678058 /thirdparty/BLAKE3/src | |
| parent | rename URI chunk requests from value -> chunk (#166) (diff) | |
| download | zen-a735967c7c54fcecbfd9760286afc06a3b48233a.tar.xz zen-a735967c7c54fcecbfd9760286afc06a3b48233a.zip | |
Use BLAKE3 port from vcpkg (#141)
use BLAKE3 port from vcpkg instead of in-tree binaries
Diffstat (limited to 'thirdparty/BLAKE3/src')
| -rw-r--r-- | thirdparty/BLAKE3/src/ffi_avx2.rs | 63 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/ffi_avx512.rs | 114 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/ffi_neon.rs | 82 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/ffi_sse2.rs | 114 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/ffi_sse41.rs | 114 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/guts.rs | 95 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/join.rs | 120 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/lib.rs | 1359 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/platform.rs | 487 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/portable.rs | 198 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/rust_avx2.rs | 474 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/rust_sse2.rs | 775 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/rust_sse41.rs | 766 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/test.rs | 569 | ||||
| -rw-r--r-- | thirdparty/BLAKE3/src/traits.rs | 184 |
15 files changed, 0 insertions, 5514 deletions
diff --git a/thirdparty/BLAKE3/src/ffi_avx2.rs b/thirdparty/BLAKE3/src/ffi_avx2.rs deleted file mode 100644 index d805e868e..000000000 --- a/thirdparty/BLAKE3/src/ffi_avx2.rs +++ /dev/null @@ -1,63 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Note that there is no AVX2 implementation of compress_in_place or -// compress_xof. - -// Unsafe because this may only be called on platforms supporting AVX2. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_avx2( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_hash_many_avx2( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_hash_many() { - if !crate::platform::avx2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_avx512.rs b/thirdparty/BLAKE3/src/ffi_avx512.rs deleted file mode 100644 index c1b9f649b..000000000 --- a/thirdparty/BLAKE3/src/ffi_avx512.rs +++ /dev/null @@ -1,114 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting AVX-512. -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) -} - -// Unsafe because this may only be called on platforms supporting AVX-512. -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut out = [0u8; 64]; - ffi::blake3_compress_xof_avx512( - cv.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - out.as_mut_ptr(), - ); - out -} - -// Unsafe because this may only be called on platforms supporting AVX-512. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_avx512( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_compress_in_place_avx512( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_avx512( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_avx512( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_compress() { - if !crate::platform::avx512_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::avx512_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_neon.rs b/thirdparty/BLAKE3/src/ffi_neon.rs deleted file mode 100644 index 889974277..000000000 --- a/thirdparty/BLAKE3/src/ffi_neon.rs +++ /dev/null @@ -1,82 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting NEON. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_neon( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -// blake3_neon.c normally depends on blake3_portable.c, because the NEON -// implementation only provides 4x compression, and it relies on the portable -// implementation for 1x compression. However, we expose the portable Rust -// implementation here instead, to avoid linking in unnecessary code. -#[no_mangle] -pub extern "C" fn blake3_compress_in_place_portable( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, -) { - unsafe { - crate::portable::compress_in_place( - &mut *(cv as *mut [u32; 8]), - &*(block as *const [u8; 64]), - block_len, - counter, - flags, - ) - } -} - -pub mod ffi { - extern "C" { - pub fn blake3_hash_many_neon( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_hash_many() { - // This entire file is gated on feature="neon", so NEON support is - // assumed here. - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_sse2.rs b/thirdparty/BLAKE3/src/ffi_sse2.rs deleted file mode 100644 index c49a229ad..000000000 --- a/thirdparty/BLAKE3/src/ffi_sse2.rs +++ /dev/null @@ -1,114 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting SSE2. -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) -} - -// Unsafe because this may only be called on platforms supporting SSE2. -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut out = [0u8; 64]; - ffi::blake3_compress_xof_sse2( - cv.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - out.as_mut_ptr(), - ); - out -} - -// Unsafe because this may only be called on platforms supporting SSE2. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_sse2( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_compress_in_place_sse2( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_sse2( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_sse2( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_compress() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/ffi_sse41.rs b/thirdparty/BLAKE3/src/ffi_sse41.rs deleted file mode 100644 index 0b64c90a0..000000000 --- a/thirdparty/BLAKE3/src/ffi_sse41.rs +++ /dev/null @@ -1,114 +0,0 @@ -use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; - -// Unsafe because this may only be called on platforms supporting SSE4.1. -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) -} - -// Unsafe because this may only be called on platforms supporting SSE4.1. -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut out = [0u8; 64]; - ffi::blake3_compress_xof_sse41( - cv.as_ptr(), - block.as_ptr(), - block_len, - counter, - flags, - out.as_mut_ptr(), - ); - out -} - -// Unsafe because this may only be called on platforms supporting SSE4.1. -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - // The Rust hash_many implementations do bounds checking on the `out` - // array, but the C implementations don't. Even though this is an unsafe - // function, assert the bounds here. - assert!(out.len() >= inputs.len() * OUT_LEN); - ffi::blake3_hash_many_sse41( - inputs.as_ptr() as *const *const u8, - inputs.len(), - A::CAPACITY / BLOCK_LEN, - key.as_ptr(), - counter, - increment_counter.yes(), - flags, - flags_start, - flags_end, - out.as_mut_ptr(), - ) -} - -pub mod ffi { - extern "C" { - pub fn blake3_compress_in_place_sse41( - cv: *mut u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - ); - pub fn blake3_compress_xof_sse41( - cv: *const u32, - block: *const u8, - block_len: u8, - counter: u64, - flags: u8, - out: *mut u8, - ); - pub fn blake3_hash_many_sse41( - inputs: *const *const u8, - num_inputs: usize, - blocks: usize, - key: *const u32, - counter: u64, - increment_counter: bool, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_compress() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/guts.rs b/thirdparty/BLAKE3/src/guts.rs deleted file mode 100644 index 88dcc86cd..000000000 --- a/thirdparty/BLAKE3/src/guts.rs +++ /dev/null @@ -1,95 +0,0 @@ -// This module is for incremental use cases like the `bao` crate, which need to -// get their hands on internal chunk and parent chaining values. The vast -// majority of users should ignore this and use the publicly documented -// interface instead. - -#[derive(Clone, Debug)] -pub struct ChunkState(crate::ChunkState); - -impl ChunkState { - // Currently this type only supports the regular hash mode. If an - // incremental user needs keyed_hash or derive_key, we can add that. - pub fn new(chunk_counter: u64) -> Self { - Self(crate::ChunkState::new( - crate::IV, - chunk_counter, - 0, - crate::platform::Platform::detect(), - )) - } - - #[inline] - pub fn len(&self) -> usize { - self.0.len() - } - - #[inline] - pub fn update(&mut self, input: &[u8]) -> &mut Self { - self.0.update(input); - self - } - - pub fn finalize(&self, is_root: bool) -> crate::Hash { - let output = self.0.output(); - if is_root { - output.root_hash() - } else { - output.chaining_value().into() - } - } -} - -// As above, this currently assumes the regular hash mode. If an incremental -// user needs keyed_hash or derive_key, we can add that. -pub fn parent_cv( - left_child: &crate::Hash, - right_child: &crate::Hash, - is_root: bool, -) -> crate::Hash { - let output = crate::parent_node_output( - left_child.as_bytes(), - right_child.as_bytes(), - crate::IV, - 0, - crate::platform::Platform::detect(), - ); - if is_root { - output.root_hash() - } else { - output.chaining_value().into() - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_chunk() { - assert_eq!( - crate::hash(b"foo"), - ChunkState::new(0).update(b"foo").finalize(true) - ); - } - - #[test] - fn test_parents() { - let mut hasher = crate::Hasher::new(); - let mut buf = [0; crate::CHUNK_LEN]; - - buf[0] = 'a' as u8; - hasher.update(&buf); - let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); - - buf[0] = 'b' as u8; - hasher.update(&buf); - let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); - - hasher.update(b"c"); - let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); - - let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); - let root = parent_cv(&parent, &chunk2_cv, true); - assert_eq!(hasher.finalize(), root); - } -} diff --git a/thirdparty/BLAKE3/src/join.rs b/thirdparty/BLAKE3/src/join.rs deleted file mode 100644 index 60932db1c..000000000 --- a/thirdparty/BLAKE3/src/join.rs +++ /dev/null @@ -1,120 +0,0 @@ -//! The multi-threading abstractions used by [`Hasher::update_with_join`]. -//! -//! Different implementations of the `Join` trait determine whether -//! [`Hasher::update_with_join`] performs multi-threading on sufficiently large -//! inputs. The `SerialJoin` implementation is single-threaded, and the -//! `RayonJoin` implementation (gated by the `rayon` feature) is -//! multi-threaded. Interfaces other than [`Hasher::update_with_join`], like -//! [`hash`] and [`Hasher::update`], always use `SerialJoin` internally. -//! -//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and -//! `RayonJoin` is the only non-trivial implementation provided. The only -//! difference between the function signature in the `Join` trait and the -//! underlying one in Rayon, is that the trait method includes two length -//! parameters. This gives an implementation the option of e.g. setting a -//! subtree size threshold below which it keeps splits on the same thread. -//! However, neither of the two provided implementations currently makes use of -//! those parameters. Note that in Rayon, the very first `join` call is more -//! expensive than subsequent calls, because it moves work from the calling -//! thread into the thread pool. That makes a coarse-grained input length -//! threshold in the caller more effective than a fine-grained subtree size -//! threshold after the implementation has already started recursing. -//! -//! # Example -//! -//! ``` -//! // Hash a large input using multi-threading. Note that multi-threading -//! // comes with some overhead, and it can actually hurt performance for small -//! // inputs. The meaning of "small" varies, however, depending on the -//! // platform and the number of threads. (On x86_64, the cutoff tends to be -//! // around 128 KiB.) You should benchmark your own use case to see whether -//! // multi-threading helps. -//! # #[cfg(feature = "rayon")] -//! # { -//! # fn some_large_input() -> &'static [u8] { b"foo" } -//! let input: &[u8] = some_large_input(); -//! let mut hasher = blake3::Hasher::new(); -//! hasher.update_with_join::<blake3::join::RayonJoin>(input); -//! let hash = hasher.finalize(); -//! # } -//! ``` -//! -//! [`Hasher::update_with_join`]: ../struct.Hasher.html#method.update_with_join -//! [`Hasher::update`]: ../struct.Hasher.html#method.update -//! [`hash`]: ../fn.hash.html -//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html - -/// The trait that abstracts over single-threaded and multi-threaded recursion. -/// -/// See the [`join` module docs](index.html) for more details. -pub trait Join { - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send; -} - -/// The trivial, serial implementation of `Join`. The left and right sides are -/// executed one after the other, on the calling thread. The standalone hashing -/// functions and the `Hasher::update` method use this implementation -/// internally. -/// -/// See the [`join` module docs](index.html) for more details. -pub enum SerialJoin {} - -impl Join for SerialJoin { - #[inline] - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send, - { - (oper_a(), oper_b()) - } -} - -/// The Rayon-based implementation of `Join`. The left and right sides are -/// executed on the Rayon thread pool, potentially in parallel. This -/// implementation is gated by the `rayon` feature, which is off by default. -/// -/// See the [`join` module docs](index.html) for more details. -#[cfg(feature = "rayon")] -pub enum RayonJoin {} - -#[cfg(feature = "rayon")] -impl Join for RayonJoin { - #[inline] - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send, - { - rayon::join(oper_a, oper_b) - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_serial_join() { - let oper_a = || 1 + 1; - let oper_b = || 2 + 2; - assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b, 3, 4)); - } - - #[test] - #[cfg(feature = "rayon")] - fn test_rayon_join() { - let oper_a = || 1 + 1; - let oper_b = || 2 + 2; - assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b, 3, 4)); - } -} diff --git a/thirdparty/BLAKE3/src/lib.rs b/thirdparty/BLAKE3/src/lib.rs deleted file mode 100644 index bf66b6dae..000000000 --- a/thirdparty/BLAKE3/src/lib.rs +++ /dev/null @@ -1,1359 +0,0 @@ -//! The official Rust implementation of the [BLAKE3] cryptographic hash -//! function. -//! -//! # Examples -//! -//! ``` -//! # fn main() -> Result<(), Box<dyn std::error::Error>> { -//! // Hash an input all at once. -//! let hash1 = blake3::hash(b"foobarbaz"); -//! -//! // Hash an input incrementally. -//! let mut hasher = blake3::Hasher::new(); -//! hasher.update(b"foo"); -//! hasher.update(b"bar"); -//! hasher.update(b"baz"); -//! let hash2 = hasher.finalize(); -//! assert_eq!(hash1, hash2); -//! -//! // Extended output. OutputReader also implements Read and Seek. -//! # #[cfg(feature = "std")] { -//! let mut output = [0; 1000]; -//! let mut output_reader = hasher.finalize_xof(); -//! output_reader.fill(&mut output); -//! assert_eq!(&output[..32], hash1.as_bytes()); -//! # } -//! -//! // Print a hash as hex. -//! println!("{}", hash1.to_hex()); -//! # Ok(()) -//! # } -//! ``` -//! -//! # Cargo Features -//! -//! The `rayon` feature provides [Rayon]-based multi-threading, in particular -//! the [`join::RayonJoin`] type for use with [`Hasher::update_with_join`]. It -//! is disabled by default, but enabled for [docs.rs]. -//! -//! The `neon` feature enables ARM NEON support. Currently there is no runtime -//! CPU feature detection for NEON, so you must only enable this feature for -//! targets that are known to have NEON support. In particular, some ARMv7 -//! targets support NEON, and some don't. -//! -//! The `std` feature (enabled by default) is required for implementations of -//! the [`Write`] and [`Seek`] traits, and also for runtime CPU feature -//! detection. If this feature is disabled, the only way to use the SIMD -//! implementations in this crate is to enable the corresponding instruction -//! sets statically for the entire build, with e.g. `RUSTFLAGS="-C -//! target-cpu=native"`. The resulting binary will not be portable to other -//! machines. -//! -//! [BLAKE3]: https://blake3.io -//! [Rayon]: https://github.com/rayon-rs/rayon -//! [`join::RayonJoin`]: join/enum.RayonJoin.html -//! [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -//! [docs.rs]: https://docs.rs/ -//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html -//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html - -#![cfg_attr(not(feature = "std"), no_std)] - -#[cfg(test)] -mod test; - -// The guts module is for incremental use cases like the `bao` crate that need -// to explicitly compute chunk and parent chaining values. It is semi-stable -// and likely to keep working, but largely undocumented and not intended for -// widespread use. -#[doc(hidden)] -pub mod guts; - -// The platform module is pub for benchmarks only. It is not stable. -#[doc(hidden)] -pub mod platform; - -// Platform-specific implementations of the compression function. These -// BLAKE3-specific cfg flags are set in build.rs. -#[cfg(blake3_avx2_rust)] -#[path = "rust_avx2.rs"] -mod avx2; -#[cfg(blake3_avx2_ffi)] -#[path = "ffi_avx2.rs"] -mod avx2; -#[cfg(blake3_avx512_ffi)] -#[path = "ffi_avx512.rs"] -mod avx512; -#[cfg(feature = "neon")] -#[path = "ffi_neon.rs"] -mod neon; -mod portable; -#[cfg(blake3_sse2_rust)] -#[path = "rust_sse2.rs"] -mod sse2; -#[cfg(blake3_sse2_ffi)] -#[path = "ffi_sse2.rs"] -mod sse2; -#[cfg(blake3_sse41_rust)] -#[path = "rust_sse41.rs"] -mod sse41; -#[cfg(blake3_sse41_ffi)] -#[path = "ffi_sse41.rs"] -mod sse41; - -pub mod traits; - -pub mod join; - -use arrayref::{array_mut_ref, array_ref}; -use arrayvec::{ArrayString, ArrayVec}; -use core::cmp; -use core::fmt; -use join::{Join, SerialJoin}; -use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; - -/// The number of bytes in a [`Hash`](struct.Hash.html), 32. -pub const OUT_LEN: usize = 32; - -/// The number of bytes in a key, 32. -pub const KEY_LEN: usize = 32; - -// These constants are pub for incremental use cases like `bao`, as well as -// tests and benchmarks. Most callers should not need them. -#[doc(hidden)] -pub const BLOCK_LEN: usize = 64; -#[doc(hidden)] -pub const CHUNK_LEN: usize = 1024; -#[doc(hidden)] -pub const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 - -// While iterating the compression function within a chunk, the CV is -// represented as words, to avoid doing two extra endianness conversions for -// each compression in the portable implementation. But the hash_many interface -// needs to hash both input bytes and parent nodes, so its better for its -// output CVs to be represented as bytes. -type CVWords = [u32; 8]; -type CVBytes = [u8; 32]; // little-endian - -const IV: &CVWords = &[ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; - -const MSG_SCHEDULE: [[usize; 16]; 7] = [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], - [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], - [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], - [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], - [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], - [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], -]; - -// These are the internal flags that we use to domain separate root/non-root, -// chunk/parent, and chunk beginning/middle/end. These get set at the high end -// of the block flags word in the compression function, so their values start -// high and go down. -const CHUNK_START: u8 = 1 << 0; -const CHUNK_END: u8 = 1 << 1; -const PARENT: u8 = 1 << 2; -const ROOT: u8 = 1 << 3; -const KEYED_HASH: u8 = 1 << 4; -const DERIVE_KEY_CONTEXT: u8 = 1 << 5; -const DERIVE_KEY_MATERIAL: u8 = 1 << 6; - -#[inline] -fn counter_low(counter: u64) -> u32 { - counter as u32 -} - -#[inline] -fn counter_high(counter: u64) -> u32 { - (counter >> 32) as u32 -} - -/// An output of the default size, 32 bytes, which provides constant-time -/// equality checking. -/// -/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides an -/// explicit [`as_bytes`] method returning `&[u8; 32]`. However, byte arrays -/// and slices don't provide constant-time equality checking, which is often a -/// security requirement in software that handles private data. `Hash` doesn't -/// implement [`Deref`] or [`AsRef`], to avoid situations where a type -/// conversion happens implicitly and the constant-time property is -/// accidentally lost. -/// -/// `Hash` provides the [`to_hex`] method for converting to hexadecimal. It -/// doesn't directly support converting from hexadecimal, but here's an example -/// of doing that with the [`hex`] crate: -/// -/// ``` -/// # fn main() -> Result<(), Box<dyn std::error::Error>> { -/// use std::convert::TryInto; -/// -/// let hash_hex = "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"; -/// let hash_bytes = hex::decode(hash_hex)?; -/// let hash_array: [u8; blake3::OUT_LEN] = hash_bytes[..].try_into()?; -/// let hash: blake3::Hash = hash_array.into(); -/// # Ok(()) -/// # } -/// ``` -/// -/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html -/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html -/// [`as_bytes`]: #method.as_bytes -/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html -/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html -/// [`to_hex`]: #method.to_hex -/// [`hex`]: https://crates.io/crates/hex -#[derive(Clone, Copy, Hash)] -pub struct Hash([u8; OUT_LEN]); - -impl Hash { - /// The bytes of the `Hash`. Note that byte arrays don't provide - /// constant-time equality checking, so if you need to compare hashes, - /// prefer the `Hash` type. - #[inline] - pub fn as_bytes(&self) -> &[u8; OUT_LEN] { - &self.0 - } - - /// The hexadecimal encoding of the `Hash`. The returned [`ArrayString`] is - /// a fixed size and doesn't allocate memory on the heap. Note that - /// [`ArrayString`] doesn't provide constant-time equality checking, so if - /// you need to compare hashes, prefer the `Hash` type. - /// - /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html - pub fn to_hex(&self) -> ArrayString<[u8; 2 * OUT_LEN]> { - let mut s = ArrayString::new(); - let table = b"0123456789abcdef"; - for &b in self.0.iter() { - s.push(table[(b >> 4) as usize] as char); - s.push(table[(b & 0xf) as usize] as char); - } - s - } -} - -impl From<[u8; OUT_LEN]> for Hash { - #[inline] - fn from(bytes: [u8; OUT_LEN]) -> Self { - Self(bytes) - } -} - -impl From<Hash> for [u8; OUT_LEN] { - #[inline] - fn from(hash: Hash) -> Self { - hash.0 - } -} - -/// This implementation is constant-time. -impl PartialEq for Hash { - #[inline] - fn eq(&self, other: &Hash) -> bool { - constant_time_eq::constant_time_eq_32(&self.0, &other.0) - } -} - -/// This implementation is constant-time. -impl PartialEq<[u8; OUT_LEN]> for Hash { - #[inline] - fn eq(&self, other: &[u8; OUT_LEN]) -> bool { - constant_time_eq::constant_time_eq_32(&self.0, other) - } -} - -impl Eq for Hash {} - -impl fmt::Debug for Hash { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // Formatting field as `&str` to reduce code size since the `Debug` - // dynamic dispatch table for `&str` is likely needed elsewhere already, - // but that for `ArrayString<[u8; 64]>` is not. - let hex = self.to_hex(); - let hex: &str = hex.as_str(); - - f.debug_tuple("Hash").field(&hex).finish() - } -} - -// Each chunk or parent node can produce either a 32-byte chaining value or, by -// setting the ROOT flag, any number of final output bytes. The Output struct -// captures the state just prior to choosing between those two possibilities. -#[derive(Clone)] -struct Output { - input_chaining_value: CVWords, - block: [u8; 64], - block_len: u8, - counter: u64, - flags: u8, - platform: Platform, -} - -impl Output { - fn chaining_value(&self) -> CVBytes { - let mut cv = self.input_chaining_value; - self.platform.compress_in_place( - &mut cv, - &self.block, - self.block_len, - self.counter, - self.flags, - ); - platform::le_bytes_from_words_32(&cv) - } - - fn root_hash(&self) -> Hash { - debug_assert_eq!(self.counter, 0); - let mut cv = self.input_chaining_value; - self.platform - .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); - Hash(platform::le_bytes_from_words_32(&cv)) - } - - fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { - self.platform.compress_xof( - &self.input_chaining_value, - &self.block, - self.block_len, - self.counter, - self.flags | ROOT, - ) - } -} - -#[derive(Clone)] -struct ChunkState { - cv: CVWords, - chunk_counter: u64, - buf: [u8; BLOCK_LEN], - buf_len: u8, - blocks_compressed: u8, - flags: u8, - platform: Platform, -} - -impl ChunkState { - fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { - Self { - cv: *key, - chunk_counter, - buf: [0; BLOCK_LEN], - buf_len: 0, - blocks_compressed: 0, - flags, - platform, - } - } - - fn len(&self) -> usize { - BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize - } - - fn fill_buf(&mut self, input: &mut &[u8]) { - let want = BLOCK_LEN - self.buf_len as usize; - let take = cmp::min(want, input.len()); - self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); - self.buf_len += take as u8; - *input = &input[take..]; - } - - fn start_flag(&self) -> u8 { - if self.blocks_compressed == 0 { - CHUNK_START - } else { - 0 - } - } - - // Try to avoid buffering as much as possible, by compressing directly from - // the input slice when full blocks are available. - fn update(&mut self, mut input: &[u8]) -> &mut Self { - if self.buf_len > 0 { - self.fill_buf(&mut input); - if !input.is_empty() { - debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); - let block_flags = self.flags | self.start_flag(); // borrowck - self.platform.compress_in_place( - &mut self.cv, - &self.buf, - BLOCK_LEN as u8, - self.chunk_counter, - block_flags, - ); - self.buf_len = 0; - self.buf = [0; BLOCK_LEN]; - self.blocks_compressed += 1; - } - } - - while input.len() > BLOCK_LEN { - debug_assert_eq!(self.buf_len, 0); - let block_flags = self.flags | self.start_flag(); // borrowck - self.platform.compress_in_place( - &mut self.cv, - array_ref!(input, 0, BLOCK_LEN), - BLOCK_LEN as u8, - self.chunk_counter, - block_flags, - ); - self.blocks_compressed += 1; - input = &input[BLOCK_LEN..]; - } - - self.fill_buf(&mut input); - debug_assert!(input.is_empty()); - debug_assert!(self.len() <= CHUNK_LEN); - self - } - - fn output(&self) -> Output { - let block_flags = self.flags | self.start_flag() | CHUNK_END; - Output { - input_chaining_value: self.cv, - block: self.buf, - block_len: self.buf_len, - counter: self.chunk_counter, - flags: block_flags, - platform: self.platform, - } - } -} - -// Don't derive(Debug), because the state may be secret. -impl fmt::Debug for ChunkState { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("ChunkState") - .field("len", &self.len()) - .field("chunk_counter", &self.chunk_counter) - .field("flags", &self.flags) - .field("platform", &self.platform) - .finish() - } -} - -// IMPLEMENTATION NOTE -// =================== -// The recursive function compress_subtree_wide(), implemented below, is the -// basis of high-performance BLAKE3. We use it both for all-at-once hashing, -// and for the incremental input with Hasher (though we have to be careful with -// subtree boundaries in the incremental case). compress_subtree_wide() applies -// several optimizations at the same time: -// - Multi-threading with Rayon. -// - Parallel chunk hashing with SIMD. -// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing -// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues -// to benefit from larger inputs, because more levels of the tree benefit can -// use full-width SIMD vectors for parent hashing. Without parallel parent -// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. - -// pub for benchmarks -#[doc(hidden)] -#[derive(Clone, Copy)] -pub enum IncrementCounter { - Yes, - No, -} - -impl IncrementCounter { - #[inline] - fn yes(&self) -> bool { - match self { - IncrementCounter::Yes => true, - IncrementCounter::No => false, - } - } -} - -// The largest power of two less than or equal to `n`, used for left_len() -// immediately below, and also directly in Hasher::update(). -fn largest_power_of_two_leq(n: usize) -> usize { - ((n / 2) + 1).next_power_of_two() -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -fn left_len(content_len: usize) -> usize { - debug_assert!(content_len > CHUNK_LEN); - // Subtract 1 to reserve at least one byte for the right side. - let full_chunks = (content_len - 1) / CHUNK_LEN; - largest_power_of_two_leq(full_chunks) * CHUNK_LEN -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. -fn compress_chunks_parallel( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert!(!input.is_empty(), "empty chunks below the root"); - debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); - - let mut chunks_exact = input.chunks_exact(CHUNK_LEN); - let mut chunks_array = ArrayVec::<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]>::new(); - for chunk in &mut chunks_exact { - chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN)); - } - platform.hash_many( - &chunks_array, - key, - chunk_counter, - IncrementCounter::Yes, - flags, - CHUNK_START, - CHUNK_END, - out, - ); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. - let chunks_so_far = chunks_array.len(); - if !chunks_exact.remainder().is_empty() { - let counter = chunk_counter + chunks_so_far as u64; - let mut chunk_state = ChunkState::new(key, counter, flags, platform); - chunk_state.update(chunks_exact.remainder()); - *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = - chunk_state.output().chaining_value(); - chunks_so_far + 1 - } else { - chunks_so_far - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. -fn compress_parents_parallel( - child_chaining_values: &[u8], - key: &CVWords, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); - let num_children = child_chaining_values.len() / OUT_LEN; - debug_assert!(num_children >= 2, "not enough children"); - debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); - - let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); - // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of - // the requirements of compress_subtree_wide(). - let mut parents_array = ArrayVec::<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE_OR_2]>::new(); - for parent in &mut parents_exact { - parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); - } - platform.hash_many( - &parents_array, - key, - 0, // Parents always use counter 0. - IncrementCounter::No, - flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out, - ); - - // If there's an odd child left over, it becomes an output. - let parents_so_far = parents_array.len(); - if !parents_exact.remainder().is_empty() { - out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); - parents_so_far + 1 - } else { - parents_so_far - } -} - -// The wide helper function returns (writes out) an array of chaining values -// and returns the length of that array. The number of chaining values returned -// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, -// if the input is shorter than that many chunks. The reason for maintaining a -// wide array of chaining values going back up the tree, is to allow the -// implementation to hash as many parents in parallel as possible. -// -// As a special case when the SIMD degree is 1, this function will still return -// at least 2 outputs. This guarantees that this function doesn't perform the -// root compression. (If it did, it would use the wrong flags, and also we -// wouldn't be able to implement exendable ouput.) Note that this function is -// not used when the whole input is only 1 chunk long; that's a different -// codepath. -// -// Why not just have the caller split the input on the first update(), instead -// of implementing this special rule? Because we don't want to limit SIMD or -// multi-threading parallelism for that update(). -fn compress_subtree_wide<J: Join>( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. This allows Rayon the option of multi-threading even the - // 2-chunk case, which can help performance on smaller platforms. - if input.len() <= platform.simd_degree() * CHUNK_LEN { - return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); - let (left, right) = input.split_at(left_len(input.len())); - let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; - - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let degree = if left.len() == CHUNK_LEN { - // The "simd_degree=1 and we're at the leaf nodes" case. - debug_assert_eq!(platform.simd_degree(), 1); - 1 - } else { - cmp::max(platform.simd_degree(), 2) - }; - let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); - - // Recurse! This uses multiple threads if the "rayon" feature is enabled. - let (left_n, right_n) = J::join( - || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out), - || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out), - left.len(), - right.len(), - ); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - debug_assert_eq!(left_n, degree); - debug_assert!(right_n >= 1 && right_n <= left_n); - if left_n == 1 { - out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); - return 2; - } - - // Otherwise, do one layer of parent node compression. - let num_children = left_n + right_n; - compress_parents_parallel( - &cv_array[..num_children * OUT_LEN], - key, - flags, - platform, - out, - ) -} - -// Hash a subtree with compress_subtree_wide(), and then condense the resulting -// list of chaining values down to a single parent node. Don't compress that -// last parent node, however. Instead, return its message bytes (the -// concatenated chaining values of its children). This is necessary when the -// first call to update() supplies a complete subtree, because the topmost -// parent node of that subtree could end up being the root. It's also necessary -// for extended output in the general case. -// -// As with compress_subtree_wide(), this function is not used on inputs of 1 -// chunk or less. That's a different codepath. -fn compress_subtree_to_parent_node<J: Join>( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, -) -> [u8; BLOCK_LEN] { - debug_assert!(input.len() > CHUNK_LEN); - let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let mut num_cvs = - compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array); - debug_assert!(num_cvs >= 2); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. - let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; - while num_cvs > 2 { - let cv_slice = &cv_array[..num_cvs * OUT_LEN]; - num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); - cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); - } - *array_ref!(cv_array, 0, 2 * OUT_LEN) -} - -// Hash a complete input all at once. Unlike compress_subtree_wide() and -// compress_subtree_to_parent_node(), this function handles the 1 chunk case. -// Note that this we use SerialJoin here, so this is always single-threaded. -fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { - let platform = Platform::detect(); - - // If the whole subtree is one chunk, hash it directly with a ChunkState. - if input.len() <= CHUNK_LEN { - return ChunkState::new(key, 0, flags, platform) - .update(input) - .output(); - } - - // Otherwise construct an Output object from the parent node returned by - // compress_subtree_to_parent_node(). - Output { - input_chaining_value: *key, - block: compress_subtree_to_parent_node::<SerialJoin>(input, key, 0, flags, platform), - block_len: BLOCK_LEN as u8, - counter: 0, - flags: flags | PARENT, - platform, - } -} - -/// The default hash function. -/// -/// For an incremental version that accepts multiple writes, see [`Hasher::update`]. -/// -/// This function is always single-threaded. For multi-threading support, see -/// [`Hasher::update_with_join`]. -/// -/// [`Hasher::update`]: struct.Hasher.html#method.update -/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -pub fn hash(input: &[u8]) -> Hash { - hash_all_at_once(input, IV, 0).root_hash() -} - -/// The keyed hash function. -/// -/// This is suitable for use as a message authentication code, for -/// example to replace an HMAC instance. -/// In that use case, the constant-time equality checking provided by -/// [`Hash`](struct.Hash.html) is almost always a security requirement, and -/// callers need to be careful not to compare MACs as raw bytes. -/// -/// This function is always single-threaded. For multi-threading support, see -/// [`Hasher::update_with_join`]. -/// -/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { - let key_words = platform::words_from_le_bytes_32(key); - hash_all_at_once(input, &key_words, KEYED_HASH).root_hash() -} - -/// The key derivation function. -/// -/// Given cryptographic key material of any length and a context string of any -/// length, this function outputs a derived subkey of any length. **The context -/// string should be hardcoded, globally unique, and application-specific.** A -/// good default format for such strings is `"[application] [commit timestamp] -/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. -/// -/// Key derivation is important when you want to use the same key in multiple -/// algorithms or use cases. Using the same key with different cryptographic -/// algorithms is generally forbidden, and deriving a separate subkey for each -/// use case protects you from bad interactions. Derived keys also mitigate the -/// damage from one part of your application accidentally leaking its key. -/// -/// As a rare exception to that general rule, however, it is possible to use -/// `derive_key` itself with key material that you are already using with -/// another algorithm. You might need to do this if you're adding features to -/// an existing application, which does not yet use key derivation internally. -/// However, you still must not share key material with algorithms that forbid -/// key reuse entirely, like a one-time pad. -/// -/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be -/// used with passwords.** Instead, use a dedicated password hash like -/// [Argon2]. Password hashes are entirely different from generic hash -/// functions, with opposite design requirements. -/// -/// This function is always single-threaded. For multi-threading support, see -/// [`Hasher::update_with_join`]. -/// -/// [`Hasher::new_derive_key`]: struct.Hasher.html#method.new_derive_key -/// [`Hasher::finalize_xof`]: struct.Hasher.html#method.finalize_xof -/// [Argon2]: https://en.wikipedia.org/wiki/Argon2 -/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join -pub fn derive_key(context: &str, key_material: &[u8], output: &mut [u8]) { - let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - let inner_output = hash_all_at_once(key_material, &context_key_words, DERIVE_KEY_MATERIAL); - OutputReader::new(inner_output).fill(output); -} - -fn parent_node_output( - left_child: &CVBytes, - right_child: &CVBytes, - key: &CVWords, - flags: u8, - platform: Platform, -) -> Output { - let mut block = [0; BLOCK_LEN]; - block[..32].copy_from_slice(left_child); - block[32..].copy_from_slice(right_child); - Output { - input_chaining_value: *key, - block, - block_len: BLOCK_LEN as u8, - counter: 0, - flags: flags | PARENT, - platform, - } -} - -/// An incremental hash state that can accept any number of writes. -/// -/// In addition to its inherent methods, this type implements several commonly -/// used traits from the [`digest`](https://crates.io/crates/digest) and -/// [`crypto_mac`](https://crates.io/crates/crypto-mac) crates. -/// -/// **Performance note:** The [`update`] and [`update_with_join`] methods -/// perform poorly when the caller's input buffer is small. See their method -/// docs below. A 16 KiB buffer is large enough to leverage all currently -/// supported SIMD instruction sets. -/// -/// # Examples -/// -/// ``` -/// # fn main() -> Result<(), Box<dyn std::error::Error>> { -/// // Hash an input incrementally. -/// let mut hasher = blake3::Hasher::new(); -/// hasher.update(b"foo"); -/// hasher.update(b"bar"); -/// hasher.update(b"baz"); -/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz")); -/// -/// // Extended output. OutputReader also implements Read and Seek. -/// # #[cfg(feature = "std")] { -/// let mut output = [0; 1000]; -/// let mut output_reader = hasher.finalize_xof(); -/// output_reader.fill(&mut output); -/// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes()); -/// # } -/// # Ok(()) -/// # } -/// ``` -/// -/// [`update`]: #method.update -/// [`update_with_join`]: #method.update_with_join -#[derive(Clone)] -pub struct Hasher { - key: CVWords, - chunk_state: ChunkState, - // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, - // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk - // requires a 4th entry, rather than merging everything down to 1, because - // we don't know whether more input is coming. This is different from how - // the reference implementation does things. - cv_stack: ArrayVec<[CVBytes; MAX_DEPTH + 1]>, -} - -impl Hasher { - fn new_internal(key: &CVWords, flags: u8) -> Self { - Self { - key: *key, - chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), - cv_stack: ArrayVec::new(), - } - } - - /// Construct a new `Hasher` for the regular hash function. - pub fn new() -> Self { - Self::new_internal(IV, 0) - } - - /// Construct a new `Hasher` for the keyed hash function. See - /// [`keyed_hash`]. - /// - /// [`keyed_hash`]: fn.keyed_hash.html - pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - let key_words = platform::words_from_le_bytes_32(key); - Self::new_internal(&key_words, KEYED_HASH) - } - - /// Construct a new `Hasher` for the key derivation function. See - /// [`derive_key`]. The context string should be hardcoded, globally - /// unique, and application-specific. - /// - /// [`derive_key`]: fn.derive_key.html - pub fn new_derive_key(context: &str) -> Self { - let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) - } - - /// Reset the `Hasher` to its initial state. - /// - /// This is functionally the same as overwriting the `Hasher` with a new - /// one, using the same key or context string if any. However, depending on - /// how much inlining the optimizer does, moving a `Hasher` might copy its - /// entire CV stack, most of which is useless uninitialized bytes. This - /// methods avoids that copy. - pub fn reset(&mut self) -> &mut Self { - self.chunk_state = ChunkState::new( - &self.key, - 0, - self.chunk_state.flags, - self.chunk_state.platform, - ); - self.cv_stack.clear(); - self - } - - // As described in push_cv() below, we do "lazy merging", delaying merges - // until right before the next CV is about to be added. This is different - // from the reference implementation. Another difference is that we aren't - // always merging 1 chunk at a time. Instead, each CV might represent any - // power-of-two number of chunks, as long as the smaller-above-larger stack - // order is maintained. Instead of the "count the trailing 0-bits" - // algorithm described in the spec, we use a "count the total number of - // 1-bits" variant that doesn't require us to retain the subtree size of - // the CV on top of the stack. The principle is the same: each CV that - // should remain in the stack is represented by a 1-bit in the total number - // of chunks (or bytes) so far. - fn merge_cv_stack(&mut self, total_len: u64) { - let post_merge_stack_len = total_len.count_ones() as usize; - while self.cv_stack.len() > post_merge_stack_len { - let right_child = self.cv_stack.pop().unwrap(); - let left_child = self.cv_stack.pop().unwrap(); - let parent_output = parent_node_output( - &left_child, - &right_child, - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - self.cv_stack.push(parent_output.chaining_value()); - } - } - - // In reference_impl.rs, we merge the new CV with existing CVs from the - // stack before pushing it. We can do that because we know more input is - // coming, so we know none of the merges are root. - // - // This setting is different. We want to feed as much input as possible to - // compress_subtree_wide(), without setting aside anything for the - // chunk_state. If the user gives us 64 KiB, we want to parallelize over - // all 64 KiB at once as a single subtree, if at all possible. - // - // This leads to two problems: - // 1) This 64 KiB input might be the only call that ever gets made to - // update. In this case, the root node of the 64 KiB subtree would be - // the root node of the whole tree, and it would need to be ROOT - // finalized. We can't compress it until we know. - // 2) This 64 KiB input might complete a larger tree, whose root node is - // similarly going to be the the root of the whole tree. For example, - // maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't - // compress the node at the root of the 256 KiB subtree until we know - // how to finalize it. - // - // The second problem is solved with "lazy merging". That is, when we're - // about to add a CV to the stack, we don't merge it with anything first, - // as the reference impl does. Instead we do merges using the *previous* CV - // that was added, which is sitting on top of the stack, and we put the new - // CV (unmerged) on top of the stack afterwards. This guarantees that we - // never merge the root node until finalize(). - // - // Solving the first problem requires an additional tool, - // compress_subtree_to_parent_node(). That function always returns the top - // *two* chaining values of the subtree it's compressing. We then do lazy - // merging with each of them separately, so that the second CV will always - // remain unmerged. (That also helps us support extendable output when - // we're hashing an input all-at-once.) - fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) { - self.merge_cv_stack(chunk_counter); - self.cv_stack.push(*new_cv); - } - - /// Add input bytes to the hash state. You can call this any number of - /// times. - /// - /// This method is always single-threaded. For multi-threading support, see - /// `update_with_join` below. - /// - /// Note that the degree of SIMD parallelism that `update` can use is - /// limited by the size of this input buffer. The 8 KiB buffer currently - /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but - /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to - /// leverage all currently supported SIMD instruction sets. - /// - /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html - pub fn update(&mut self, input: &[u8]) -> &mut Self { - self.update_with_join::<SerialJoin>(input) - } - - /// Add input bytes to the hash state, as with `update`, but potentially - /// using multi-threading. See the example below, and the - /// [`join`](join/index.html) module for a more detailed explanation. - /// - /// To get any performance benefit from multi-threading, the input buffer - /// size needs to be very large. As a rule of thumb on x86_64, there is no - /// benefit to multi-threading inputs less than 128 KiB. Other platforms - /// have different thresholds, and in general you need to benchmark your - /// specific use case. Where possible, memory mapping an entire input file - /// is recommended, to take maximum advantage of multi-threading without - /// needing to tune a specific buffer size. Where memory mapping is not - /// possible, good multi-threading performance requires doing IO on a - /// background thread, to avoid sleeping all your worker threads while the - /// input buffer is (serially) refilled. This is quite complicated compared - /// to memory mapping. - /// - /// # Example - /// - /// ``` - /// // Hash a large input using multi-threading. Note that multi-threading - /// // comes with some overhead, and it can actually hurt performance for small - /// // inputs. The meaning of "small" varies, however, depending on the - /// // platform and the number of threads. (On x86_64, the cutoff tends to be - /// // around 128 KiB.) You should benchmark your own use case to see whether - /// // multi-threading helps. - /// # #[cfg(feature = "rayon")] - /// # { - /// # fn some_large_input() -> &'static [u8] { b"foo" } - /// let input: &[u8] = some_large_input(); - /// let mut hasher = blake3::Hasher::new(); - /// hasher.update_with_join::<blake3::join::RayonJoin>(input); - /// let hash = hasher.finalize(); - /// # } - /// ``` - pub fn update_with_join<J: Join>(&mut self, mut input: &[u8]) -> &mut Self { - // If we have some partial chunk bytes in the internal chunk_state, we - // need to finish that chunk first. - if self.chunk_state.len() > 0 { - let want = CHUNK_LEN - self.chunk_state.len(); - let take = cmp::min(want, input.len()); - self.chunk_state.update(&input[..take]); - input = &input[take..]; - if !input.is_empty() { - // We've filled the current chunk, and there's more input - // coming, so we know it's not the root and we can finalize it. - // Then we'll proceed to hashing whole chunks below. - debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); - let chunk_cv = self.chunk_state.output().chaining_value(); - self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); - self.chunk_state = ChunkState::new( - &self.key, - self.chunk_state.chunk_counter + 1, - self.chunk_state.flags, - self.chunk_state.platform, - ); - } else { - return self; - } - } - - // Now the chunk_state is clear, and we have more input. If there's - // more than a single chunk (so, definitely not the root chunk), hash - // the largest whole subtree we can, with the full benefits of SIMD and - // multi-threading parallelism. Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees - // along the right edge can be incomplete, and we don't know where - // the right edge is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until - // this point (if total is not 0). If the current incomplete subtree - // is only waiting for 1 more chunk, we can't hash a subtree of 4 - // chunks. We have to complete the current subtree first. - // Because we might need to break up the input to form powers of 2, or - // to evenly divide what we already have, this part runs in a loop. - while input.len() > CHUNK_LEN { - debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); - debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); - let mut subtree_len = largest_power_of_two_leq(input.len()); - let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; - // Shrink the subtree_len until it evenly divides the count so far. - // We know that subtree_len itself is a power of 2, so we can use a - // bitmasking trick instead of an actual remainder operation. (Note - // that if the caller consistently passes power-of-2 inputs of the - // same size, as is hopefully typical, this loop condition will - // always fail, and subtree_len will always be the full length of - // the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. - // For example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, - // we'll still get the right answer in the end, and we might get to - // use 2-way SIMD parallelism. The problem with this optimization, - // is that it gets us stuck always hashing 2 chunks. The total - // number of chunks will remain odd, and we'll never graduate to - // higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while (subtree_len - 1) as u64 & count_so_far != 0 { - subtree_len /= 2; - } - // The shrunken subtree_len might now be 1 chunk long. If so, hash - // that one chunk by itself. Otherwise, compress the subtree into a - // pair of CVs. - let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; - if subtree_len <= CHUNK_LEN { - debug_assert_eq!(subtree_len, CHUNK_LEN); - self.push_cv( - &ChunkState::new( - &self.key, - self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, - ) - .update(&input[..subtree_len]) - .output() - .chaining_value(), - self.chunk_state.chunk_counter, - ); - } else { - // This is the high-performance happy path, though getting here - // depends on the caller giving us a long enough input. - let cv_pair = compress_subtree_to_parent_node::<J>( - &input[..subtree_len], - &self.key, - self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, - ); - let left_cv = array_ref!(cv_pair, 0, 32); - let right_cv = array_ref!(cv_pair, 32, 32); - // Push the two CVs we received into the CV stack in order. Because - // the stack merges lazily, this guarantees we aren't merging the - // root. - self.push_cv(left_cv, self.chunk_state.chunk_counter); - self.push_cv( - right_cv, - self.chunk_state.chunk_counter + (subtree_chunks / 2), - ); - } - self.chunk_state.chunk_counter += subtree_chunks; - input = &input[subtree_len..]; - } - - // What remains is 1 chunk or less. Add it to the chunk state. - debug_assert!(input.len() <= CHUNK_LEN); - if !input.is_empty() { - self.chunk_state.update(input); - // Having added some input to the chunk_state, we know what's in - // the CV stack won't become the root node, and we can do an extra - // merge. This simplifies finalize(). - self.merge_cv_stack(self.chunk_state.chunk_counter); - } - - self - } - - fn final_output(&self) -> Output { - // If the current chunk is the only chunk, that makes it the root node - // also. Convert it directly into an Output. Otherwise, we need to - // merge subtrees below. - if self.cv_stack.is_empty() { - debug_assert_eq!(self.chunk_state.chunk_counter, 0); - return self.chunk_state.output(); - } - - // If there are any bytes in the ChunkState, finalize that chunk and - // merge its CV with everything in the CV stack. In that case, the work - // we did at the end of update() above guarantees that the stack - // doesn't contain any unmerged subtrees that need to be merged first. - // (This is important, because if there were two chunk hashes sitting - // on top of the stack, they would need to merge with each other, and - // merging a new chunk hash into them would be incorrect.) - // - // If there are no bytes in the ChunkState, we'll merge what's already - // in the stack. In this case it's fine if there are unmerged chunks on - // top, because we'll merge them with each other. Note that the case of - // the empty chunk is taken care of above. - let mut output: Output; - let mut num_cvs_remaining = self.cv_stack.len(); - if self.chunk_state.len() > 0 { - debug_assert_eq!( - self.cv_stack.len(), - self.chunk_state.chunk_counter.count_ones() as usize, - "cv stack does not need a merge" - ); - output = self.chunk_state.output(); - } else { - debug_assert!(self.cv_stack.len() >= 2); - output = parent_node_output( - &self.cv_stack[num_cvs_remaining - 2], - &self.cv_stack[num_cvs_remaining - 1], - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - num_cvs_remaining -= 2; - } - while num_cvs_remaining > 0 { - output = parent_node_output( - &self.cv_stack[num_cvs_remaining - 1], - &output.chaining_value(), - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - num_cvs_remaining -= 1; - } - output - } - - /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of - /// the input. - /// - /// This method is idempotent. Calling it twice will give the same result. - /// You can also add more input and finalize again. - pub fn finalize(&self) -> Hash { - self.final_output().root_hash() - } - - /// Finalize the hash state and return an [`OutputReader`], which can - /// supply any number of output bytes. - /// - /// This method is idempotent. Calling it twice will give the same result. - /// You can also add more input and finalize again. - /// - /// [`OutputReader`]: struct.OutputReader.html - pub fn finalize_xof(&self) -> OutputReader { - OutputReader::new(self.final_output()) - } -} - -// Don't derive(Debug), because the state may be secret. -impl fmt::Debug for Hasher { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Hasher") - .field("flags", &self.chunk_state.flags) - .field("platform", &self.chunk_state.platform) - .finish() - } -} - -impl Default for Hasher { - #[inline] - fn default() -> Self { - Self::new() - } -} - -#[cfg(feature = "std")] -impl std::io::Write for Hasher { - /// This is equivalent to [`update`](#method.update). - #[inline] - fn write(&mut self, input: &[u8]) -> std::io::Result<usize> { - self.update(input); - Ok(input.len()) - } - - #[inline] - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } -} - -/// An incremental reader for extended output, returned by -/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). -#[derive(Clone)] -pub struct OutputReader { - inner: Output, - position_within_block: u8, -} - -impl OutputReader { - fn new(inner: Output) -> Self { - Self { - inner, - position_within_block: 0, - } - } - - /// Fill a buffer with output bytes and advance the position of the - /// `OutputReader`. This is equivalent to [`Read::read`], except that it - /// doesn't return a `Result`. Both methods always fill the entire buffer. - /// - /// Note that `OutputReader` doesn't buffer output bytes internally, so - /// calling `fill` repeatedly with a short-length or odd-length slice will - /// end up performing the same compression multiple times. If you're - /// reading output in a loop, prefer a slice length that's a multiple of - /// 64. - /// - /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try - /// to extract more than that, for example by seeking near the end and - /// reading further, the behavior is unspecified. - /// - /// [`Read::read`]: #method.read - pub fn fill(&mut self, mut buf: &mut [u8]) { - while !buf.is_empty() { - let block: [u8; BLOCK_LEN] = self.inner.root_output_block(); - let output_bytes = &block[self.position_within_block as usize..]; - let take = cmp::min(buf.len(), output_bytes.len()); - buf[..take].copy_from_slice(&output_bytes[..take]); - buf = &mut buf[take..]; - self.position_within_block += take as u8; - if self.position_within_block == BLOCK_LEN as u8 { - self.inner.counter += 1; - self.position_within_block = 0; - } - } - } - - /// Return the current read position in the output stream. The position of - /// a new `OutputReader` starts at 0, and each call to [`fill`] or - /// [`Read::read`] moves the position forward by the number of bytes read. - /// - /// [`fill`]: #method.fill - /// [`Read::read`]: #method.read - pub fn position(&self) -> u64 { - self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64 - } - - /// Seek to a new read position in the output stream. This is equivalent to - /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't - /// return a `Result`. - /// - /// [`Seek::seek`]: #method.seek - /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html - pub fn set_position(&mut self, position: u64) { - self.position_within_block = (position % BLOCK_LEN as u64) as u8; - self.inner.counter = position / BLOCK_LEN as u64; - } -} - -// Don't derive(Debug), because the state may be secret. -impl fmt::Debug for OutputReader { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("OutputReader") - .field("position", &self.position()) - .finish() - } -} - -#[cfg(feature = "std")] -impl std::io::Read for OutputReader { - #[inline] - fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { - self.fill(buf); - Ok(buf.len()) - } -} - -#[cfg(feature = "std")] -impl std::io::Seek for OutputReader { - fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> { - let max_position = u64::max_value() as i128; - let target_position: i128 = match pos { - std::io::SeekFrom::Start(x) => x as i128, - std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128, - std::io::SeekFrom::End(_) => { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "seek from end not supported", - )); - } - }; - if target_position < 0 { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "seek before start", - )); - } - self.set_position(cmp::min(target_position, max_position) as u64); - Ok(self.position()) - } -} diff --git a/thirdparty/BLAKE3/src/platform.rs b/thirdparty/BLAKE3/src/platform.rs deleted file mode 100644 index 4bd67de7a..000000000 --- a/thirdparty/BLAKE3/src/platform.rs +++ /dev/null @@ -1,487 +0,0 @@ -use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; -use arrayref::{array_mut_ref, array_ref}; - -cfg_if::cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - cfg_if::cfg_if! { - if #[cfg(blake3_avx512_ffi)] { - pub const MAX_SIMD_DEGREE: usize = 16; - } else { - pub const MAX_SIMD_DEGREE: usize = 8; - } - } - } else if #[cfg(feature = "neon")] { - pub const MAX_SIMD_DEGREE: usize = 4; - } else { - pub const MAX_SIMD_DEGREE: usize = 1; - } -} - -// There are some places where we want a static size that's equal to the -// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently -// allowed to use cmp::max, so we have to hardcode this additional constant -// value. Get rid of this once cmp::max is a const fn. -cfg_if::cfg_if! { - if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - cfg_if::cfg_if! { - if #[cfg(blake3_avx512_ffi)] { - pub const MAX_SIMD_DEGREE_OR_2: usize = 16; - } else { - pub const MAX_SIMD_DEGREE_OR_2: usize = 8; - } - } - } else if #[cfg(feature = "neon")] { - pub const MAX_SIMD_DEGREE_OR_2: usize = 4; - } else { - pub const MAX_SIMD_DEGREE_OR_2: usize = 2; - } -} - -#[derive(Clone, Copy, Debug)] -pub enum Platform { - Portable, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - SSE2, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - SSE41, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - AVX2, - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - AVX512, - #[cfg(feature = "neon")] - NEON, -} - -impl Platform { - #[allow(unreachable_code)] - pub fn detect() -> Self { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - #[cfg(blake3_avx512_ffi)] - { - if avx512_detected() { - return Platform::AVX512; - } - } - if avx2_detected() { - return Platform::AVX2; - } - if sse41_detected() { - return Platform::SSE41; - } - if sse2_detected() { - return Platform::SSE2; - } - } - // We don't use dynamic feature detection for NEON. If the "neon" - // feature is on, NEON is assumed to be supported. - #[cfg(feature = "neon")] - { - return Platform::NEON; - } - Platform::Portable - } - - pub fn simd_degree(&self) -> usize { - let degree = match self { - Platform::Portable => 1, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => 4, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 => 4, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX2 => 8, - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => 16, - #[cfg(feature = "neon")] - Platform::NEON => 4, - }; - debug_assert!(degree <= MAX_SIMD_DEGREE); - degree - } - - pub fn compress_in_place( - &self, - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, - ) { - match self { - Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - crate::sse2::compress_in_place(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 | Platform::AVX2 => unsafe { - crate::sse41::compress_in_place(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => unsafe { - crate::avx512::compress_in_place(cv, block, block_len, counter, flags) - }, - // No NEON compress_in_place() implementation yet. - #[cfg(feature = "neon")] - Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), - } - } - - pub fn compress_xof( - &self, - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, - ) -> [u8; 64] { - match self { - Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - crate::sse2::compress_xof(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 | Platform::AVX2 => unsafe { - crate::sse41::compress_xof(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => unsafe { - crate::avx512::compress_xof(cv, block, block_len, counter, flags) - }, - // No NEON compress_xof() implementation yet. - #[cfg(feature = "neon")] - Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), - } - } - - // IMPLEMENTATION NOTE - // =================== - // hash_many() applies two optimizations. The critically important - // optimization is the high-performance parallel SIMD hashing mode, - // described in detail in the spec. This more than doubles throughput per - // thread. Another optimization is keeping the state vectors transposed - // from block to block within a chunk. When state vectors are transposed - // after every block, there's a small but measurable performance loss. - // Compressing chunks with a dedicated loop avoids this. - - pub fn hash_many<A: arrayvec::Array<Item = u8>>( - &self, - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], - ) { - match self { - Platform::Portable => portable::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - crate::sse2::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 => unsafe { - crate::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX2 => unsafe { - crate::avx2::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX512 => unsafe { - crate::avx512::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Assumed to be safe if the "neon" feature is on. - #[cfg(feature = "neon")] - Platform::NEON => unsafe { - crate::neon::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - } - } - - // Explicit platform constructors, for benchmarks. - - pub fn portable() -> Self { - Self::Portable - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn sse2() -> Option<Self> { - if sse2_detected() { - Some(Self::SSE2) - } else { - None - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn sse41() -> Option<Self> { - if sse41_detected() { - Some(Self::SSE41) - } else { - None - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn avx2() -> Option<Self> { - if avx2_detected() { - Some(Self::AVX2) - } else { - None - } - } - - #[cfg(blake3_avx512_ffi)] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn avx512() -> Option<Self> { - if avx512_detected() { - Some(Self::AVX512) - } else { - None - } - } - - #[cfg(feature = "neon")] - pub fn neon() -> Option<Self> { - // Assumed to be safe if the "neon" feature is on. - Some(Self::NEON) - } -} - -// Note that AVX-512 is divided into multiple featuresets, and we use two of -// them, F and VL. -#[cfg(blake3_avx512_ffi)] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn avx512_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_avx512") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] - { - return true; - } - // Dynamic check, if std is enabled. - #[cfg(feature = "std")] - { - if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { - return true; - } - } - false -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn avx2_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_avx2") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "avx2")] - { - return true; - } - // Dynamic check, if std is enabled. - #[cfg(feature = "std")] - { - if is_x86_feature_detected!("avx2") { - return true; - } - } - false -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn sse41_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_sse41") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "sse4.1")] - { - return true; - } - // Dynamic check, if std is enabled. - #[cfg(feature = "std")] - { - if is_x86_feature_detected!("sse4.1") { - return true; - } - } - false -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -#[allow(unreachable_code)] -pub fn sse2_detected() -> bool { - // A testing-only short-circuit. - if cfg!(feature = "no_sse2") { - return false; - } - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "sse2")] - { - return true; - } - // Dynamic check, if std is enabled. - #[cfg(feature = "std")] - { - if is_x86_feature_detected!("sse2") { - return true; - } - } - false -} - -#[inline(always)] -pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { - let mut out = [0; 8]; - out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); - out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); - out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); - out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); - out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); - out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); - out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); - out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); - out -} - -#[inline(always)] -pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { - let mut out = [0; 16]; - out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); - out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); - out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); - out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); - out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); - out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); - out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); - out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); - out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); - out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); - out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); - out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); - out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); - out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); - out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); - out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); - out -} - -#[inline(always)] -pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { - let mut out = [0; 32]; - *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); - *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); - *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); - *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); - *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); - *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); - *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); - *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); - out -} - -#[inline(always)] -pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { - let mut out = [0; 64]; - *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); - *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); - *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); - *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); - *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); - *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); - *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); - *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); - *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); - *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); - *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); - *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); - *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); - *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); - *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); - *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); - out -} diff --git a/thirdparty/BLAKE3/src/portable.rs b/thirdparty/BLAKE3/src/portable.rs deleted file mode 100644 index 0a569cec7..000000000 --- a/thirdparty/BLAKE3/src/portable.rs +++ /dev/null @@ -1,198 +0,0 @@ -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref}; - -#[inline(always)] -fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { - state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); - state[d] = (state[d] ^ state[a]).rotate_right(16); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(12); - state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); - state[d] = (state[d] ^ state[a]).rotate_right(8); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(7); -} - -#[inline(always)] -fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { - // Select the message schedule based on the round. - let schedule = MSG_SCHEDULE[round]; - - // Mix the columns. - g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the diagonals. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -#[inline(always)] -fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u32; 16] { - let block_words = crate::platform::words_from_le_bytes_64(block); - - let mut state = [ - cv[0], - cv[1], - cv[2], - cv[3], - cv[4], - cv[5], - cv[6], - cv[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ]; - - round(&mut state, &block_words, 0); - round(&mut state, &block_words, 1); - round(&mut state, &block_words, 2); - round(&mut state, &block_words, 3); - round(&mut state, &block_words, 4); - round(&mut state, &block_words, 5); - round(&mut state, &block_words, 6); - - state -} - -pub fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let state = compress_pre(cv, block, block_len, counter, flags); - - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -pub fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut state = compress_pre(cv, block, block_len, counter, flags); - state[0] ^= state[8]; - state[1] ^= state[9]; - state[2] ^= state[10]; - state[3] ^= state[11]; - state[4] ^= state[12]; - state[5] ^= state[13]; - state[6] ^= state[14]; - state[7] ^= state[15]; - state[8] ^= cv[0]; - state[9] ^= cv[1]; - state[10] ^= cv[2]; - state[11] ^= cv[3]; - state[12] ^= cv[4]; - state[13] ^= cv[5]; - state[14] ^= cv[6]; - state[15] ^= cv[7]; - crate::platform::le_bytes_from_words_64(&state) -} - -pub fn hash1<A: arrayvec::Array<Item = u8>>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = crate::platform::le_bytes_from_words_32(&cv); -} - -pub fn hash_many<A: arrayvec::Array<Item = u8>>( - inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -pub mod test { - use super::*; - - // This is basically testing the portable implementation against itself, - // but it also checks that compress_in_place and compress_xof are - // consistent. And there are tests against the reference implementation and - // against hardcoded test vectors elsewhere. - #[test] - fn test_compress() { - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - // Ditto. - #[test] - fn test_hash_many() { - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/rust_avx2.rs b/thirdparty/BLAKE3/src/rust_avx2.rs deleted file mode 100644 index 6ab773ad4..000000000 --- a/thirdparty/BLAKE3/src/rust_avx2.rs +++ /dev/null @@ -1,474 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, -}; -use arrayref::{array_mut_ref, mut_array_refs}; - -pub const DEGREE: usize = 8; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m256i { - // This is an unaligned load, so the pointer cast is allowed. - _mm256_loadu_si256(src as *const __m256i) -} - -#[inline(always)] -unsafe fn storeu(src: __m256i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm256_storeu_si256(dest as *mut __m256i, src) -} - -#[inline(always)] -unsafe fn add(a: __m256i, b: __m256i) -> __m256i { - _mm256_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { - _mm256_xor_si256(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m256i { - _mm256_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { - _mm256_setr_epi32( - a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, - ) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { - ( - _mm256_permute2x128_si256(a, b, 0x20), - _mm256_permute2x128_si256(a, b, 0x31), - ) -} - -// There are several ways to do a transposition. We could do it naively, with 8 separate -// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy -// the vecs into contiguous storage and then use gather instructions. This third approach is to use -// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the -// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the -// https://github.com/oconnor663/bao_experiments repo. -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. - let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. - let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. - let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); - let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); - let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); - let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); - - vecs[0] = abcdefgh_0; - vecs[1] = abcdefgh_1; - vecs[2] = abcdefgh_2; - vecs[3] = abcdefgh_3; - vecs[4] = abcdefgh_4; - vecs[5] = abcdefgh_5; - vecs[6] = abcdefgh_6; - vecs[7] = abcdefgh_7; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set8( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - counter_low(counter + (mask & 4)), - counter_low(counter + (mask & 5)), - counter_low(counter + (mask & 6)), - counter_low(counter + (mask & 7)), - ), - set8( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - counter_high(counter + (mask & 4)), - counter_high(counter + (mask & 5)), - counter_high(counter + (mask & 6)), - counter_high(counter + (mask & 7)), - ), - ) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash8( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. - let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&mut h_vecs); - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. - let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash8( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - crate::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ); -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::avx2_detected() { - return; - } - - #[target_feature(enable = "avx2")] - unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. - assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_hash_many() { - if !crate::platform::avx2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/rust_sse2.rs b/thirdparty/BLAKE3/src/rust_sse2.rs deleted file mode 100644 index 15b52ee5d..000000000 --- a/thirdparty/BLAKE3/src/rust_sse2.rs +++ /dev/null @@ -1,775 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref, mut_array_refs}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - let mut mask = _mm_set1_epi16(imm8 as i16); - mask = _mm_and_si128(mask, bits); - mask = _mm_cmpeq_epi16(mask, bits); - _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse2")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse2")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, mut row3] = - compress_pre(cv, block, block_len, counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - transpose_vecs(squares.2); - transpose_vecs(squares.3); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse2")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. - let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse2")] -unsafe fn hash1<A: arrayvec::Array<Item = u8>>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse2")] -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. - let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::sse2_detected() { - return; - } - - #[target_feature(enable = "sse2")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. - assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/rust_sse41.rs b/thirdparty/BLAKE3/src/rust_sse41.rs deleted file mode 100644 index d5cf0f4a9..000000000 --- a/thirdparty/BLAKE3/src/rust_sse41.rs +++ /dev/null @@ -1,766 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref, mut_array_refs}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, mut row3] = - compress_pre(cv, block, block_len, counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - transpose_vecs(squares.2); - transpose_vecs(squares.3); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. - let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse4.1")] -unsafe fn hash1<A: arrayvec::Array<Item = u8>>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. - let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::sse41_detected() { - return; - } - - #[target_feature(enable = "sse4.1")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. - assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/thirdparty/BLAKE3/src/test.rs b/thirdparty/BLAKE3/src/test.rs deleted file mode 100644 index eefb1a354..000000000 --- a/thirdparty/BLAKE3/src/test.rs +++ /dev/null @@ -1,569 +0,0 @@ -use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; -use arrayref::array_ref; -use arrayvec::ArrayVec; -use core::sync::atomic::{AtomicUsize, Ordering}; -use core::usize; -use rand::prelude::*; - -// Interesting input lengths to run tests on. -pub const TEST_CASES: &[usize] = &[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - BLOCK_LEN - 1, - BLOCK_LEN, - BLOCK_LEN + 1, - 2 * BLOCK_LEN - 1, - 2 * BLOCK_LEN, - 2 * BLOCK_LEN + 1, - CHUNK_LEN - 1, - CHUNK_LEN, - CHUNK_LEN + 1, - 2 * CHUNK_LEN, - 2 * CHUNK_LEN + 1, - 3 * CHUNK_LEN, - 3 * CHUNK_LEN + 1, - 4 * CHUNK_LEN, - 4 * CHUNK_LEN + 1, - 5 * CHUNK_LEN, - 5 * CHUNK_LEN + 1, - 6 * CHUNK_LEN, - 6 * CHUNK_LEN + 1, - 7 * CHUNK_LEN, - 7 * CHUNK_LEN + 1, - 8 * CHUNK_LEN, - 8 * CHUNK_LEN + 1, - 16 * CHUNK_LEN, // AVX512's bandwidth - 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 - 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks -]; - -pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; - -// There's a test to make sure these two are equal below. -pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; -pub const TEST_KEY_WORDS: CVWords = [ - 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, -]; - -// Paint the input with a repeating byte pattern. We use a cycle length of 251, -// because that's the largets prime number less than 256. This makes it -// unlikely to swapping any two adjacent input blocks or chunks will give the -// same answer. -pub fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -type CompressInPlaceFn = - unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); - -type CompressXofFn = unsafe fn( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64]; - -// A shared helper function for platform-specific tests. -pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { - let initial_state = TEST_KEY_WORDS; - let block_len: u8 = 61; - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len as usize]); - // Use a counter with set bits in both 32-bit words. - let counter = (5u64 << 32) + 6; - let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; - - let portable_out = - crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); - - let mut test_state = initial_state; - unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; - let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); - let test_xof = - unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; - - assert_eq!(&portable_out[..32], &test_state_bytes[..]); - assert_eq!(&portable_out[..], &test_xof[..]); -} - -type HashManyFn<A> = unsafe fn( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -); - -// A shared helper function for platform-specific tests. -pub fn test_hash_many_fn( - hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, - hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, -) { - // 31 (16 + 8 + 4 + 2 + 1) inputs - const NUM_INPUTS: usize = 31; - let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; - crate::test::paint_test_input(&mut input_buf); - // A counter just prior to u32::MAX. - let counter = (1u64 << 32) - 1; - - // First hash chunks. - let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new(); - for i in 0..NUM_INPUTS { - chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); - } - let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - crate::portable::hash_many( - &chunks, - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - crate::KEYED_HASH, - crate::CHUNK_START, - crate::CHUNK_END, - &mut portable_chunks_out, - ); - - let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_chunks_fn( - &chunks[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - crate::KEYED_HASH, - crate::CHUNK_START, - crate::CHUNK_END, - &mut test_chunks_out, - ); - } - for n in 0..NUM_INPUTS { - #[cfg(feature = "std")] - dbg!(n); - assert_eq!( - &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], - &test_chunks_out[n * OUT_LEN..][..OUT_LEN] - ); - } - - // Then hash parents. - let mut parents = ArrayVec::<[&[u8; 2 * OUT_LEN]; NUM_INPUTS]>::new(); - for i in 0..NUM_INPUTS { - parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); - } - let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; - crate::portable::hash_many( - &parents, - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - crate::KEYED_HASH | crate::PARENT, - 0, - 0, - &mut portable_parents_out, - ); - - let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_parents_fn( - &parents[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - crate::KEYED_HASH | crate::PARENT, - 0, - 0, - &mut test_parents_out, - ); - } - for n in 0..NUM_INPUTS { - #[cfg(feature = "std")] - dbg!(n); - assert_eq!( - &portable_parents_out[n * OUT_LEN..][..OUT_LEN], - &test_parents_out[n * OUT_LEN..][..OUT_LEN] - ); - } -} - -#[test] -fn test_key_bytes_equal_key_words() { - assert_eq!( - TEST_KEY_WORDS, - crate::platform::words_from_le_bytes_32(&TEST_KEY), - ); -} - -#[test] -fn test_reference_impl_size() { - // Because the Rust compiler optimizes struct layout, it's possible that - // some future version of the compiler will produce a different size. If - // that happens, we can either disable this test, or test for multiple - // expected values. For now, the purpose of this test is to make sure we - // notice if that happens. - assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>()); -} - -#[test] -fn test_counter_words() { - let counter: u64 = (1 << 32) + 2; - assert_eq!(crate::counter_low(counter), 2); - assert_eq!(crate::counter_high(counter), 1); -} - -#[test] -fn test_largest_power_of_two_leq() { - let input_output = &[ - // The zero case is nonsensical, but it does work. - (0, 1), - (1, 1), - (2, 2), - (3, 2), - (4, 4), - (5, 4), - (6, 4), - (7, 4), - (8, 8), - // the largest possible usize - (usize::MAX, (usize::MAX >> 1) + 1), - ]; - for &(input, output) in input_output { - assert_eq!( - output, - crate::largest_power_of_two_leq(input), - "wrong output for n={}", - input - ); - } -} - -#[test] -fn test_left_len() { - let input_output = &[ - (CHUNK_LEN + 1, CHUNK_LEN), - (2 * CHUNK_LEN - 1, CHUNK_LEN), - (2 * CHUNK_LEN, CHUNK_LEN), - (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN, 2 * CHUNK_LEN), - (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), - ]; - for &(input, output) in input_output { - assert_eq!(crate::left_len(input), output); - } -} - -#[test] -fn test_compare_reference_impl() { - const OUT: usize = 303; // more than 64, not a multiple of 4 - let mut input_buf = [0; TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - for &case in TEST_CASES { - let input = &input_buf[..case]; - #[cfg(feature = "std")] - dbg!(case); - - // regular - { - let mut reference_hasher = reference_impl::Hasher::new(); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let test_out = crate::hash(input); - assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); - // incremental - let mut hasher = crate::Hasher::new(); - hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), test_out); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended[..], expected_out[..]); - } - - // keyed - { - let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let test_out = crate::keyed_hash(&TEST_KEY, input); - assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); - // incremental - let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); - hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), test_out); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended[..], expected_out[..]); - } - - // derive_key - { - let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; - let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let mut test_out = [0; OUT]; - crate::derive_key(context, input, &mut test_out); - assert_eq!(test_out[..], expected_out[..]); - // incremental - let mut hasher = crate::Hasher::new_derive_key(context); - hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended[..], expected_out[..]); - } - } -} - -fn reference_hash(input: &[u8]) -> crate::Hash { - let mut hasher = reference_impl::Hasher::new(); - hasher.update(input); - let mut bytes = [0; 32]; - hasher.finalize(&mut bytes); - bytes.into() -} - -#[test] -fn test_compare_update_multiple() { - // Don't use all the long test cases here, since that's unnecessarily slow - // in debug mode. - let mut short_test_cases = TEST_CASES; - while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { - short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; - } - assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); - - let mut input_buf = [0; 2 * TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - - for &first_update in short_test_cases { - #[cfg(feature = "std")] - dbg!(first_update); - let first_input = &input_buf[..first_update]; - let mut test_hasher = crate::Hasher::new(); - test_hasher.update(first_input); - - for &second_update in short_test_cases { - #[cfg(feature = "std")] - dbg!(second_update); - let second_input = &input_buf[first_update..][..second_update]; - let total_input = &input_buf[..first_update + second_update]; - - // Clone the hasher with first_update bytes already written, so - // that the next iteration can reuse it. - let mut test_hasher = test_hasher.clone(); - test_hasher.update(second_input); - let expected = reference_hash(total_input); - assert_eq!(expected, test_hasher.finalize()); - } - } -} - -#[test] -fn test_fuzz_hasher() { - const INPUT_MAX: usize = 4 * CHUNK_LEN; - let mut input_buf = [0; 3 * INPUT_MAX]; - paint_test_input(&mut input_buf); - - // Don't do too many iterations in debug mode, to keep the tests under a - // second or so. CI should run tests in release mode also. Provide an - // environment variable for specifying a larger number of fuzz iterations. - let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; - - // Use a fixed RNG seed for reproducibility. - let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); - for _num_test in 0..num_tests { - #[cfg(feature = "std")] - dbg!(_num_test); - let mut hasher = crate::Hasher::new(); - let mut total_input = 0; - // For each test, write 3 inputs of random length. - for _ in 0..3 { - let input_len = rng.gen_range(0, INPUT_MAX + 1); - #[cfg(feature = "std")] - dbg!(input_len); - let input = &input_buf[total_input..][..input_len]; - hasher.update(input); - total_input += input_len; - } - let expected = reference_hash(&input_buf[..total_input]); - assert_eq!(expected, hasher.finalize()); - } -} - -#[test] -fn test_xof_seek() { - let mut out = [0; 533]; - let mut hasher = crate::Hasher::new(); - hasher.update(b"foo"); - hasher.finalize_xof().fill(&mut out); - assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); - - let mut reader = hasher.finalize_xof(); - reader.set_position(303); - let mut out2 = [0; 102]; - reader.fill(&mut out2); - assert_eq!(&out[303..][..102], &out2[..]); - - #[cfg(feature = "std")] - { - use std::io::prelude::*; - let mut reader = hasher.finalize_xof(); - reader.seek(std::io::SeekFrom::Start(303)).unwrap(); - let mut out3 = Vec::new(); - reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); - assert_eq!(&out[303..][..102], &out3[..]); - - assert_eq!( - reader.seek(std::io::SeekFrom::Current(0)).unwrap(), - 303 + 102 - ); - reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); - assert_eq!( - reader.seek(std::io::SeekFrom::Current(0)).unwrap(), - 303 + 102 - 5 - ); - let mut out4 = [0; 17]; - assert_eq!(reader.read(&mut out4).unwrap(), 17); - assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); - assert_eq!( - reader.seek(std::io::SeekFrom::Current(0)).unwrap(), - 303 + 102 - 5 + 17 - ); - assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); - assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); - } -} - -#[test] -fn test_msg_schdule_permutation() { - let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; - - let mut generated = [[0; 16]; 7]; - generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - - for round in 1..7 { - for i in 0..16 { - generated[round][i] = generated[round - 1][permutation[i]]; - } - } - - assert_eq!(generated, crate::MSG_SCHEDULE); -} - -#[test] -fn test_reset() { - let mut hasher = crate::Hasher::new(); - hasher.update(&[42; 3 * CHUNK_LEN + 7]); - hasher.reset(); - hasher.update(&[42; CHUNK_LEN + 3]); - assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); - - let key = &[99; crate::KEY_LEN]; - let mut keyed_hasher = crate::Hasher::new_keyed(key); - keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); - keyed_hasher.reset(); - keyed_hasher.update(&[42; CHUNK_LEN + 3]); - assert_eq!( - keyed_hasher.finalize(), - crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), - ); - - let context = "BLAKE3 2020-02-12 10:20:58 reset test"; - let mut kdf = crate::Hasher::new_derive_key(context); - kdf.update(&[42; 3 * CHUNK_LEN + 7]); - kdf.reset(); - kdf.update(&[42; CHUNK_LEN + 3]); - let mut expected = [0; crate::OUT_LEN]; - crate::derive_key(context, &[42; CHUNK_LEN + 3], &mut expected); - assert_eq!(kdf.finalize(), expected); -} - -#[test] -#[cfg(feature = "rayon")] -fn test_update_with_rayon_join() { - let mut input = [0; TEST_CASES_MAX]; - paint_test_input(&mut input); - let rayon_hash = crate::Hasher::new() - .update_with_join::<crate::join::RayonJoin>(&input) - .finalize(); - assert_eq!(crate::hash(&input), rayon_hash); -} - -// Test that the length values given to Join::join are what they're supposed to -// be. -#[test] -fn test_join_lengths() { - // Use static atomics to let us safely get a couple of values in and out of - // CustomJoin. This avoids depending on std, though it assumes that this - // thread will only run once in the lifetime of the runner process. - static SINGLE_THREAD_LEN: AtomicUsize = AtomicUsize::new(0); - static CUSTOM_JOIN_CALLS: AtomicUsize = AtomicUsize::new(0); - - // Use an input that's exactly (simd_degree * CHUNK_LEN) + 1. That should - // guarantee that compress_subtree_wide does exactly one split, with the - // last byte on the right side. Note that it we used - // Hasher::update_with_join, we would end up buffering that last byte, - // rather than splitting and joining it. - let single_thread_len = crate::platform::Platform::detect().simd_degree() * CHUNK_LEN; - SINGLE_THREAD_LEN.store(single_thread_len, Ordering::SeqCst); - let mut input_buf = [0; 2 * crate::platform::MAX_SIMD_DEGREE * CHUNK_LEN]; - paint_test_input(&mut input_buf); - let input = &input_buf[..single_thread_len + 1]; - - enum CustomJoin {} - - impl crate::join::Join for CustomJoin { - fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send, - { - let prev_calls = CUSTOM_JOIN_CALLS.fetch_add(1, Ordering::SeqCst); - assert_eq!(prev_calls, 0); - assert_eq!(len_a, SINGLE_THREAD_LEN.load(Ordering::SeqCst)); - assert_eq!(len_b, 1); - (oper_a(), oper_b()) - } - } - - let mut out_buf = [0; crate::platform::MAX_SIMD_DEGREE_OR_2 * CHUNK_LEN]; - crate::compress_subtree_wide::<CustomJoin>( - input, - crate::IV, - 0, - 0, - crate::platform::Platform::detect(), - &mut out_buf, - ); - assert_eq!(CUSTOM_JOIN_CALLS.load(Ordering::SeqCst), 1); -} diff --git a/thirdparty/BLAKE3/src/traits.rs b/thirdparty/BLAKE3/src/traits.rs deleted file mode 100644 index 9704e0106..000000000 --- a/thirdparty/BLAKE3/src/traits.rs +++ /dev/null @@ -1,184 +0,0 @@ -//! Implementations of commonly used traits like -//! [`digest::Digest`](https://crates.io/crates/digest) and -//! [`crypto_mac::Mac`](https://crates.io/crates/crypto-mac). - -pub use crypto_mac; -pub use digest; - -use crate::{Hasher, OutputReader}; -use digest::generic_array::{ - typenum::{U32, U64}, - GenericArray, -}; - -impl digest::BlockInput for Hasher { - type BlockSize = U64; -} - -impl digest::Update for Hasher { - #[inline] - fn update(&mut self, data: impl AsRef<[u8]>) { - self.update(data.as_ref()); - } -} - -impl digest::Reset for Hasher { - #[inline] - fn reset(&mut self) { - self.reset(); // the inherent method - } -} - -impl digest::FixedOutput for Hasher { - type OutputSize = U32; - - #[inline] - fn finalize_into(self, out: &mut GenericArray<u8, Self::OutputSize>) { - out.copy_from_slice(self.finalize().as_bytes()); - } - - #[inline] - fn finalize_into_reset(&mut self, out: &mut GenericArray<u8, Self::OutputSize>) { - out.copy_from_slice(self.finalize().as_bytes()); - self.reset(); - } -} - -impl digest::ExtendableOutput for Hasher { - type Reader = OutputReader; - - #[inline] - fn finalize_xof(self) -> Self::Reader { - Hasher::finalize_xof(&self) - } - - #[inline] - fn finalize_xof_reset(&mut self) -> Self::Reader { - let reader = Hasher::finalize_xof(self); - self.reset(); - reader - } -} - -impl digest::XofReader for OutputReader { - #[inline] - fn read(&mut self, buffer: &mut [u8]) { - self.fill(buffer); - } -} - -impl crypto_mac::NewMac for Hasher { - type KeySize = U32; - - #[inline] - fn new(key: &crypto_mac::Key<Self>) -> Self { - let key_bytes: [u8; 32] = (*key).into(); - Hasher::new_keyed(&key_bytes) - } -} - -impl crypto_mac::Mac for Hasher { - type OutputSize = U32; - - #[inline] - fn update(&mut self, data: &[u8]) { - self.update(data); - } - - #[inline] - fn reset(&mut self) { - self.reset(); - } - - #[inline] - fn finalize(self) -> crypto_mac::Output<Self> { - crypto_mac::Output::new(digest::Digest::finalize(self)) - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_digest_traits() { - // Inherent methods. - let mut hasher1 = crate::Hasher::new(); - hasher1.update(b"foo"); - hasher1.update(b"bar"); - hasher1.update(b"baz"); - let out1 = hasher1.finalize(); - let mut xof1 = [0; 301]; - hasher1.finalize_xof().fill(&mut xof1); - assert_eq!(out1.as_bytes(), &xof1[..32]); - - // Trait implementations. - let mut hasher2: crate::Hasher = digest::Digest::new(); - digest::Digest::update(&mut hasher2, b"xxx"); - digest::Digest::reset(&mut hasher2); - digest::Digest::update(&mut hasher2, b"foo"); - digest::Digest::update(&mut hasher2, b"bar"); - digest::Digest::update(&mut hasher2, b"baz"); - let out2 = digest::Digest::finalize(hasher2.clone()); - let mut xof2 = [0; 301]; - digest::XofReader::read( - &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), - &mut xof2, - ); - assert_eq!(out1.as_bytes(), &out2[..]); - assert_eq!(xof1[..], xof2[..]); - - // Again with the resetting variants. - let mut hasher3: crate::Hasher = digest::Digest::new(); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut out3 = [0; 32]; - digest::FixedOutput::finalize_into_reset( - &mut hasher3, - GenericArray::from_mut_slice(&mut out3), - ); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut out4 = [0; 32]; - digest::FixedOutput::finalize_into_reset( - &mut hasher3, - GenericArray::from_mut_slice(&mut out4), - ); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut xof3 = [0; 301]; - digest::XofReader::read( - &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3), - &mut xof3, - ); - digest::Digest::update(&mut hasher3, b"foobarbaz"); - let mut xof4 = [0; 301]; - digest::XofReader::read( - &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3), - &mut xof4, - ); - assert_eq!(out1.as_bytes(), &out3[..]); - assert_eq!(out1.as_bytes(), &out4[..]); - assert_eq!(xof1[..], xof3[..]); - assert_eq!(xof1[..], xof4[..]); - } - - #[test] - fn test_mac_trait() { - // Inherent methods. - let key = b"some super secret key bytes fooo"; - let mut hasher1 = crate::Hasher::new_keyed(key); - hasher1.update(b"foo"); - hasher1.update(b"bar"); - hasher1.update(b"baz"); - let out1 = hasher1.finalize(); - - // Trait implementation. - let generic_key = (*key).into(); - let mut hasher2: crate::Hasher = crypto_mac::NewMac::new(&generic_key); - crypto_mac::Mac::update(&mut hasher2, b"xxx"); - crypto_mac::Mac::reset(&mut hasher2); - crypto_mac::Mac::update(&mut hasher2, b"foo"); - crypto_mac::Mac::update(&mut hasher2, b"bar"); - crypto_mac::Mac::update(&mut hasher2, b"baz"); - let out2 = crypto_mac::Mac::finalize(hasher2); - assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice()); - } -} |