diff options
Diffstat (limited to '3rdparty/BLAKE3/src/platform.rs')
| -rw-r--r-- | 3rdparty/BLAKE3/src/platform.rs | 487 |
1 files changed, 487 insertions, 0 deletions
diff --git a/3rdparty/BLAKE3/src/platform.rs b/3rdparty/BLAKE3/src/platform.rs new file mode 100644 index 000000000..4bd67de7a --- /dev/null +++ b/3rdparty/BLAKE3/src/platform.rs @@ -0,0 +1,487 @@ +use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; +use arrayref::{array_mut_ref, array_ref}; + +cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + cfg_if::cfg_if! { + if #[cfg(blake3_avx512_ffi)] { + pub const MAX_SIMD_DEGREE: usize = 16; + } else { + pub const MAX_SIMD_DEGREE: usize = 8; + } + } + } else if #[cfg(feature = "neon")] { + pub const MAX_SIMD_DEGREE: usize = 4; + } else { + pub const MAX_SIMD_DEGREE: usize = 1; + } +} + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently +// allowed to use cmp::max, so we have to hardcode this additional constant +// value. Get rid of this once cmp::max is a const fn. +cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + cfg_if::cfg_if! { + if #[cfg(blake3_avx512_ffi)] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 16; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 8; + } + } + } else if #[cfg(feature = "neon")] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 4; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 2; + } +} + +#[derive(Clone, Copy, Debug)] +pub enum Platform { + Portable, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE2, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE41, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX2, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX512, + #[cfg(feature = "neon")] + NEON, +} + +impl Platform { + #[allow(unreachable_code)] + pub fn detect() -> Self { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + #[cfg(blake3_avx512_ffi)] + { + if avx512_detected() { + return Platform::AVX512; + } + } + if avx2_detected() { + return Platform::AVX2; + } + if sse41_detected() { + return Platform::SSE41; + } + if sse2_detected() { + return Platform::SSE2; + } + } + // We don't use dynamic feature detection for NEON. If the "neon" + // feature is on, NEON is assumed to be supported. + #[cfg(feature = "neon")] + { + return Platform::NEON; + } + Platform::Portable + } + + pub fn simd_degree(&self) -> usize { + let degree = match self { + Platform::Portable => 1, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => 8, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => 16, + #[cfg(feature = "neon")] + Platform::NEON => 4, + }; + debug_assert!(degree <= MAX_SIMD_DEGREE); + degree + } + + pub fn compress_in_place( + &self, + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) { + match self { + Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_in_place(cv, block, block_len, counter, flags) + }, + // No NEON compress_in_place() implementation yet. + #[cfg(feature = "neon")] + Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), + } + } + + pub fn compress_xof( + &self, + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) -> [u8; 64] { + match self { + Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_xof(cv, block, block_len, counter, flags) + }, + // No NEON compress_xof() implementation yet. + #[cfg(feature = "neon")] + Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), + } + } + + // IMPLEMENTATION NOTE + // =================== + // hash_many() applies two optimizations. The critically important + // optimization is the high-performance parallel SIMD hashing mode, + // described in detail in the spec. This more than doubles throughput per + // thread. Another optimization is keeping the state vectors transposed + // from block to block within a chunk. When state vectors are transposed + // after every block, there's a small but measurable performance loss. + // Compressing chunks with a dedicated loop avoids this. + + pub fn hash_many<A: arrayvec::Array<Item = u8>>( + &self, + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], + ) { + match self { + Platform::Portable => portable::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => unsafe { + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => unsafe { + crate::avx2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Assumed to be safe if the "neon" feature is on. + #[cfg(feature = "neon")] + Platform::NEON => unsafe { + crate::neon::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + } + } + + // Explicit platform constructors, for benchmarks. + + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse2() -> Option<Self> { + if sse2_detected() { + Some(Self::SSE2) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option<Self> { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option<Self> { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } + + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx512() -> Option<Self> { + if avx512_detected() { + Some(Self::AVX512) + } else { + None + } + } + + #[cfg(feature = "neon")] + pub fn neon() -> Option<Self> { + // Assumed to be safe if the "neon" feature is on. + Some(Self::NEON) + } +} + +// Note that AVX-512 is divided into multiple featuresets, and we use two of +// them, F and VL. +#[cfg(blake3_avx512_ffi)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx512_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_avx512") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx2_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_avx2") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "avx2")] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx2") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn sse41_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_sse41") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse4.1")] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("sse4.1") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +#[allow(unreachable_code)] +pub fn sse2_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_sse2") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse2")] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("sse2") { + return true; + } + } + false +} + +#[inline(always)] +pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { + let mut out = [0; 8]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out +} + +#[inline(always)] +pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { + let mut out = [0; 16]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); + out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); + out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); + out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); + out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); + out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); + out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); + out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { + let mut out = [0; 64]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); + *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); + *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); + *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); + *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); + *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); + *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); + *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); + out +} |