aboutsummaryrefslogtreecommitdiff
path: root/thirdparty/blake3/src/platform.rs
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2025-11-07 14:49:13 +0100
committerGitHub Enterprise <[email protected]>2025-11-07 14:49:13 +0100
commit24e43a913f29ac3b314354e8ce5175f135bcc64f (patch)
treeca442937ceeb63461012b33a4576e9835099f106 /thirdparty/blake3/src/platform.rs
parentget oplog attachments (#622) (diff)
downloadzen-24e43a913f29ac3b314354e8ce5175f135bcc64f.tar.xz
zen-24e43a913f29ac3b314354e8ce5175f135bcc64f.zip
switch to xmake for package management (#611)
This change removes our dependency on vcpkg for package management, in favour of bringing some code in-tree in the `thirdparty` folder as well as using the xmake build-in package management feature. For the latter, all the package definitions are maintained in the zen repo itself, in the `repo` folder. It should now also be easier to build the project as it will no longer depend on having the right version of vcpkg installed, which has been a common problem for new people coming in to the codebase. Now you should only need xmake to build. * Bumps xmake requirement on github runners to 2.9.9 to resolve an issue where xmake on Windows invokes cmake with `v144` toolchain which does not exist * BLAKE3 is now in-tree at `thirdparty/blake3` * cpr is now in-tree at `thirdparty/cpr` * cxxopts is now in-tree at `thirdparty/cxxopts` * fmt is now in-tree at `thirdparty/fmt` * robin-map is now in-tree at `thirdparty/robin-map` * ryml is now in-tree at `thirdparty/ryml` * sol2 is now in-tree at `thirdparty/sol2` * spdlog is now in-tree at `thirdparty/spdlog` * utfcpp is now in-tree at `thirdparty/utfcpp` * xmake package repo definitions is in `repo` * implemented support for sanitizers. ASAN is supported on windows, TSAN, UBSAN, MSAN etc are supported on Linux/MacOS though I have not yet tested it extensively on MacOS * the zencore encryption implementation also now supports using mbedTLS which is used on MacOS, though for now we still use openssl on Linux * crashpad * bumps libcurl to 8.11.0 (from 8.8.0) which should address a rare build upload bug
Diffstat (limited to 'thirdparty/blake3/src/platform.rs')
-rw-r--r--thirdparty/blake3/src/platform.rs587
1 files changed, 587 insertions, 0 deletions
diff --git a/thirdparty/blake3/src/platform.rs b/thirdparty/blake3/src/platform.rs
new file mode 100644
index 000000000..51b3b7b17
--- /dev/null
+++ b/thirdparty/blake3/src/platform.rs
@@ -0,0 +1,587 @@
+use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
+use arrayref::{array_mut_ref, array_ref};
+
+cfg_if::cfg_if! {
+ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ cfg_if::cfg_if! {
+ if #[cfg(blake3_avx512_ffi)] {
+ pub const MAX_SIMD_DEGREE: usize = 16;
+ } else {
+ pub const MAX_SIMD_DEGREE: usize = 8;
+ }
+ }
+ } else if #[cfg(blake3_neon)] {
+ pub const MAX_SIMD_DEGREE: usize = 4;
+ } else if #[cfg(blake3_wasm32_simd)] {
+ pub const MAX_SIMD_DEGREE: usize = 4;
+ } else {
+ pub const MAX_SIMD_DEGREE: usize = 1;
+ }
+}
+
+// There are some places where we want a static size that's equal to the
+// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
+// allowed to use cmp::max, so we have to hardcode this additional constant
+// value. Get rid of this once cmp::max is a const fn.
+cfg_if::cfg_if! {
+ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ cfg_if::cfg_if! {
+ if #[cfg(blake3_avx512_ffi)] {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
+ } else {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
+ }
+ }
+ } else if #[cfg(blake3_neon)] {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
+ } else if #[cfg(blake3_wasm32_simd)] {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
+ } else {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
+ }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum Platform {
+ Portable,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ SSE2,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ SSE41,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ AVX2,
+ #[cfg(blake3_avx512_ffi)]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ AVX512,
+ #[cfg(blake3_neon)]
+ NEON,
+ #[cfg(blake3_wasm32_simd)]
+ #[allow(non_camel_case_types)]
+ WASM32_SIMD,
+}
+
+impl Platform {
+ #[allow(unreachable_code)]
+ pub fn detect() -> Self {
+ #[cfg(miri)]
+ {
+ return Platform::Portable;
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ #[cfg(blake3_avx512_ffi)]
+ {
+ if avx512_detected() {
+ return Platform::AVX512;
+ }
+ }
+ if avx2_detected() {
+ return Platform::AVX2;
+ }
+ if sse41_detected() {
+ return Platform::SSE41;
+ }
+ if sse2_detected() {
+ return Platform::SSE2;
+ }
+ }
+ // We don't use dynamic feature detection for NEON. If the "neon"
+ // feature is on, NEON is assumed to be supported.
+ #[cfg(blake3_neon)]
+ {
+ return Platform::NEON;
+ }
+ #[cfg(blake3_wasm32_simd)]
+ {
+ return Platform::WASM32_SIMD;
+ }
+ Platform::Portable
+ }
+
+ pub fn simd_degree(&self) -> usize {
+ let degree = match self {
+ Platform::Portable => 1,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE2 => 4,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE41 => 4,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => 8,
+ #[cfg(blake3_avx512_ffi)]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX512 => 16,
+ #[cfg(blake3_neon)]
+ Platform::NEON => 4,
+ #[cfg(blake3_wasm32_simd)]
+ Platform::WASM32_SIMD => 4,
+ };
+ debug_assert!(degree <= MAX_SIMD_DEGREE);
+ degree
+ }
+
+ pub fn compress_in_place(
+ &self,
+ cv: &mut CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ counter: u64,
+ flags: u8,
+ ) {
+ match self {
+ Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE2 => unsafe {
+ crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE41 | Platform::AVX2 => unsafe {
+ crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(blake3_avx512_ffi)]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX512 => unsafe {
+ crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
+ },
+ // No NEON compress_in_place() implementation yet.
+ #[cfg(blake3_neon)]
+ Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
+ #[cfg(blake3_wasm32_simd)]
+ Platform::WASM32_SIMD => {
+ crate::wasm32_simd::compress_in_place(cv, block, block_len, counter, flags)
+ }
+ }
+ }
+
+ pub fn compress_xof(
+ &self,
+ cv: &CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ counter: u64,
+ flags: u8,
+ ) -> [u8; 64] {
+ match self {
+ Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE2 => unsafe {
+ crate::sse2::compress_xof(cv, block, block_len, counter, flags)
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE41 | Platform::AVX2 => unsafe {
+ crate::sse41::compress_xof(cv, block, block_len, counter, flags)
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(blake3_avx512_ffi)]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX512 => unsafe {
+ crate::avx512::compress_xof(cv, block, block_len, counter, flags)
+ },
+ // No NEON compress_xof() implementation yet.
+ #[cfg(blake3_neon)]
+ Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
+ #[cfg(blake3_wasm32_simd)]
+ Platform::WASM32_SIMD => {
+ crate::wasm32_simd::compress_xof(cv, block, block_len, counter, flags)
+ }
+ }
+ }
+
+ // IMPLEMENTATION NOTE
+ // ===================
+ // hash_many() applies two optimizations. The critically important
+ // optimization is the high-performance parallel SIMD hashing mode,
+ // described in detail in the spec. This more than doubles throughput per
+ // thread. Another optimization is keeping the state vectors transposed
+ // from block to block within a chunk. When state vectors are transposed
+ // after every block, there's a small but measurable performance loss.
+ // Compressing chunks with a dedicated loop avoids this.
+
+ pub fn hash_many<const N: usize>(
+ &self,
+ inputs: &[&[u8; N]],
+ key: &CVWords,
+ counter: u64,
+ increment_counter: IncrementCounter,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: &mut [u8],
+ ) {
+ match self {
+ Platform::Portable => portable::hash_many(
+ inputs,
+ key,
+ counter,
+ increment_counter,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ ),
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE2 => unsafe {
+ crate::sse2::hash_many(
+ inputs,
+ key,
+ counter,
+ increment_counter,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE41 => unsafe {
+ crate::sse41::hash_many(
+ inputs,
+ key,
+ counter,
+ increment_counter,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => unsafe {
+ crate::avx2::hash_many(
+ inputs,
+ key,
+ counter,
+ increment_counter,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(blake3_avx512_ffi)]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX512 => unsafe {
+ crate::avx512::hash_many(
+ inputs,
+ key,
+ counter,
+ increment_counter,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
+ // Assumed to be safe if the "neon" feature is on.
+ #[cfg(blake3_neon)]
+ Platform::NEON => unsafe {
+ crate::neon::hash_many(
+ inputs,
+ key,
+ counter,
+ increment_counter,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
+ // Assumed to be safe if the "wasm32_simd" feature is on.
+ #[cfg(blake3_wasm32_simd)]
+ Platform::WASM32_SIMD => unsafe {
+ crate::wasm32_simd::hash_many(
+ inputs,
+ key,
+ counter,
+ increment_counter,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
+ }
+ }
+
+ pub fn xof_many(
+ &self,
+ cv: &CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ mut counter: u64,
+ flags: u8,
+ out: &mut [u8],
+ ) {
+ debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
+ if out.is_empty() {
+ // The current assembly implementation always outputs at least 1 block.
+ return;
+ }
+ match self {
+ // Safe because detect() checked for platform support.
+ #[cfg(blake3_avx512_ffi)]
+ #[cfg(unix)]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX512 => unsafe {
+ crate::avx512::xof_many(cv, block, block_len, counter, flags, out)
+ },
+ _ => {
+ // For platforms without an optimized xof_many, fall back to a loop over
+ // compress_xof. This is still faster than portable code.
+ for out_block in out.chunks_exact_mut(BLOCK_LEN) {
+ // TODO: Use array_chunks_mut here once that's stable.
+ let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap();
+ *out_array = self.compress_xof(cv, block, block_len, counter, flags);
+ counter += 1;
+ }
+ }
+ }
+ }
+
+ // Explicit platform constructors, for benchmarks.
+
+ pub fn portable() -> Self {
+ Self::Portable
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ pub fn sse2() -> Option<Self> {
+ if sse2_detected() {
+ Some(Self::SSE2)
+ } else {
+ None
+ }
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ pub fn sse41() -> Option<Self> {
+ if sse41_detected() {
+ Some(Self::SSE41)
+ } else {
+ None
+ }
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ pub fn avx2() -> Option<Self> {
+ if avx2_detected() {
+ Some(Self::AVX2)
+ } else {
+ None
+ }
+ }
+
+ #[cfg(blake3_avx512_ffi)]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ pub fn avx512() -> Option<Self> {
+ if avx512_detected() {
+ Some(Self::AVX512)
+ } else {
+ None
+ }
+ }
+
+ #[cfg(blake3_neon)]
+ pub fn neon() -> Option<Self> {
+ // Assumed to be safe if the "neon" feature is on.
+ Some(Self::NEON)
+ }
+
+ #[cfg(blake3_wasm32_simd)]
+ pub fn wasm32_simd() -> Option<Self> {
+ // Assumed to be safe if the "wasm32_simd" feature is on.
+ Some(Self::WASM32_SIMD)
+ }
+}
+
+// Note that AVX-512 is divided into multiple featuresets, and we use two of
+// them, F and VL.
+#[cfg(blake3_avx512_ffi)]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[inline(always)]
+#[allow(unreachable_code)]
+pub fn avx512_detected() -> bool {
+ if cfg!(miri) {
+ return false;
+ }
+
+ // A testing-only short-circuit.
+ if cfg!(feature = "no_avx512") {
+ return false;
+ }
+ // Static check, e.g. for building with target-cpu=native.
+ #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
+ {
+ return true;
+ }
+ // Dynamic check, if std is enabled.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+ return true;
+ }
+ }
+ false
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[inline(always)]
+#[allow(unreachable_code)]
+pub fn avx2_detected() -> bool {
+ if cfg!(miri) {
+ return false;
+ }
+
+ // A testing-only short-circuit.
+ if cfg!(feature = "no_avx2") {
+ return false;
+ }
+ // Static check, e.g. for building with target-cpu=native.
+ #[cfg(target_feature = "avx2")]
+ {
+ return true;
+ }
+ // Dynamic check, if std is enabled.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx2") {
+ return true;
+ }
+ }
+ false
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[inline(always)]
+#[allow(unreachable_code)]
+pub fn sse41_detected() -> bool {
+ if cfg!(miri) {
+ return false;
+ }
+
+ // A testing-only short-circuit.
+ if cfg!(feature = "no_sse41") {
+ return false;
+ }
+ // Static check, e.g. for building with target-cpu=native.
+ #[cfg(target_feature = "sse4.1")]
+ {
+ return true;
+ }
+ // Dynamic check, if std is enabled.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("sse4.1") {
+ return true;
+ }
+ }
+ false
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[inline(always)]
+#[allow(unreachable_code)]
+pub fn sse2_detected() -> bool {
+ if cfg!(miri) {
+ return false;
+ }
+
+ // A testing-only short-circuit.
+ if cfg!(feature = "no_sse2") {
+ return false;
+ }
+ // Static check, e.g. for building with target-cpu=native.
+ #[cfg(target_feature = "sse2")]
+ {
+ return true;
+ }
+ // Dynamic check, if std is enabled.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("sse2") {
+ return true;
+ }
+ }
+ false
+}
+
+#[inline(always)]
+pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
+ let mut out = [0; 8];
+ out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
+ out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
+ out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
+ out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
+ out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
+ out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
+ out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
+ out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
+ out
+}
+
+#[inline(always)]
+pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
+ let mut out = [0; 16];
+ out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
+ out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
+ out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
+ out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
+ out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
+ out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
+ out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
+ out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
+ out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
+ out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
+ out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
+ out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
+ out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
+ out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
+ out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
+ out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
+ out
+}
+
+#[inline(always)]
+pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
+ let mut out = [0; 32];
+ *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
+ *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
+ *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
+ *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
+ *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
+ *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
+ *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
+ *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
+ out
+}
+
+#[inline(always)]
+pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
+ let mut out = [0; 64];
+ *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
+ *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
+ *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
+ *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
+ *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
+ *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
+ *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
+ *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
+ *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
+ *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
+ *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
+ *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
+ *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
+ *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
+ *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
+ *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
+ out
+}