aboutsummaryrefslogtreecommitdiff
path: root/thirdparty/blake3/c/blake3_dispatch.c
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2025-11-07 14:49:13 +0100
committerGitHub Enterprise <[email protected]>2025-11-07 14:49:13 +0100
commit24e43a913f29ac3b314354e8ce5175f135bcc64f (patch)
treeca442937ceeb63461012b33a4576e9835099f106 /thirdparty/blake3/c/blake3_dispatch.c
parentget oplog attachments (#622) (diff)
downloadzen-24e43a913f29ac3b314354e8ce5175f135bcc64f.tar.xz
zen-24e43a913f29ac3b314354e8ce5175f135bcc64f.zip
switch to xmake for package management (#611)
This change removes our dependency on vcpkg for package management, in favour of bringing some code in-tree in the `thirdparty` folder as well as using the xmake build-in package management feature. For the latter, all the package definitions are maintained in the zen repo itself, in the `repo` folder. It should now also be easier to build the project as it will no longer depend on having the right version of vcpkg installed, which has been a common problem for new people coming in to the codebase. Now you should only need xmake to build. * Bumps xmake requirement on github runners to 2.9.9 to resolve an issue where xmake on Windows invokes cmake with `v144` toolchain which does not exist * BLAKE3 is now in-tree at `thirdparty/blake3` * cpr is now in-tree at `thirdparty/cpr` * cxxopts is now in-tree at `thirdparty/cxxopts` * fmt is now in-tree at `thirdparty/fmt` * robin-map is now in-tree at `thirdparty/robin-map` * ryml is now in-tree at `thirdparty/ryml` * sol2 is now in-tree at `thirdparty/sol2` * spdlog is now in-tree at `thirdparty/spdlog` * utfcpp is now in-tree at `thirdparty/utfcpp` * xmake package repo definitions is in `repo` * implemented support for sanitizers. ASAN is supported on windows, TSAN, UBSAN, MSAN etc are supported on Linux/MacOS though I have not yet tested it extensively on MacOS * the zencore encryption implementation also now supports using mbedTLS which is used on MacOS, though for now we still use openssl on Linux * crashpad * bumps libcurl to 8.11.0 (from 8.8.0) which should address a rare build upload bug
Diffstat (limited to 'thirdparty/blake3/c/blake3_dispatch.c')
-rw-r--r--thirdparty/blake3/c/blake3_dispatch.c332
1 files changed, 332 insertions, 0 deletions
diff --git a/thirdparty/blake3/c/blake3_dispatch.c b/thirdparty/blake3/c/blake3_dispatch.c
new file mode 100644
index 000000000..52db90586
--- /dev/null
+++ b/thirdparty/blake3/c/blake3_dispatch.c
@@ -0,0 +1,332 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "blake3_impl.h"
+
+#if defined(_MSC_VER)
+#include <Windows.h>
+#endif
+
+#if defined(IS_X86)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <immintrin.h>
+#else
+#undef IS_X86 /* Unimplemented! */
+#endif
+#endif
+
+#if !defined(BLAKE3_ATOMICS)
+#if defined(__has_include)
+#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
+#define BLAKE3_ATOMICS 1
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
+#else
+#define BLAKE3_ATOMICS 0
+#endif /* defined(__has_include) */
+#endif /* BLAKE3_ATOMICS */
+
+#if BLAKE3_ATOMICS
+#define ATOMIC_INT _Atomic int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#elif defined(_MSC_VER)
+#define ATOMIC_INT LONG
+#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
+#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
+#else
+#define ATOMIC_INT int
+#define ATOMIC_LOAD(x) x
+#define ATOMIC_STORE(x, y) x = y
+#endif
+
+#define MAYBE_UNUSED(x) (void)((x))
+
+#if defined(IS_X86)
+static uint64_t xgetbv(void) {
+#if defined(_MSC_VER)
+ return _xgetbv(0);
+#else
+ uint32_t eax = 0, edx = 0;
+ __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
+ return ((uint64_t)edx << 32) | eax;
+#endif
+}
+
+static void cpuid(uint32_t out[4], uint32_t id) {
+#if defined(_MSC_VER)
+ __cpuid((int *)out, id);
+#elif defined(__i386__) || defined(_M_IX86)
+ __asm__ __volatile__("movl %%ebx, %1\n"
+ "cpuid\n"
+ "xchgl %1, %%ebx\n"
+ : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id));
+#else
+ __asm__ __volatile__("cpuid\n"
+ : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id));
+#endif
+}
+
+static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
+#if defined(_MSC_VER)
+ __cpuidex((int *)out, id, sid);
+#elif defined(__i386__) || defined(_M_IX86)
+ __asm__ __volatile__("movl %%ebx, %1\n"
+ "cpuid\n"
+ "xchgl %1, %%ebx\n"
+ : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id), "c"(sid));
+#else
+ __asm__ __volatile__("cpuid\n"
+ : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id), "c"(sid));
+#endif
+}
+
+#endif
+
+enum cpu_feature {
+ SSE2 = 1 << 0,
+ SSSE3 = 1 << 1,
+ SSE41 = 1 << 2,
+ AVX = 1 << 3,
+ AVX2 = 1 << 4,
+ AVX512F = 1 << 5,
+ AVX512VL = 1 << 6,
+ /* ... */
+ UNDEFINED = 1 << 30
+};
+
+#if !defined(BLAKE3_TESTING)
+static /* Allow the variable to be controlled manually for testing */
+#endif
+ ATOMIC_INT g_cpu_features = UNDEFINED;
+
+#if !defined(BLAKE3_TESTING)
+static
+#endif
+ enum cpu_feature
+ get_cpu_features(void) {
+
+ /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
+ enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
+ if (features != UNDEFINED) {
+ return features;
+ } else {
+#if defined(IS_X86)
+ uint32_t regs[4] = {0};
+ uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
+ (void)edx;
+ features = 0;
+ cpuid(regs, 0);
+ const int max_id = *eax;
+ cpuid(regs, 1);
+#if defined(__amd64__) || defined(_M_X64)
+ features |= SSE2;
+#else
+ if (*edx & (1UL << 26))
+ features |= SSE2;
+#endif
+ if (*ecx & (1UL << 9))
+ features |= SSSE3;
+ if (*ecx & (1UL << 19))
+ features |= SSE41;
+
+ if (*ecx & (1UL << 27)) { // OSXSAVE
+ const uint64_t mask = xgetbv();
+ if ((mask & 6) == 6) { // SSE and AVX states
+ if (*ecx & (1UL << 28))
+ features |= AVX;
+ if (max_id >= 7) {
+ cpuidex(regs, 7, 0);
+ if (*ebx & (1UL << 5))
+ features |= AVX2;
+ if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
+ if (*ebx & (1UL << 31))
+ features |= AVX512VL;
+ if (*ebx & (1UL << 16))
+ features |= AVX512F;
+ }
+ }
+ }
+ }
+ ATOMIC_STORE(g_cpu_features, features);
+ return features;
+#else
+ /* How to detect NEON? */
+ return 0;
+#endif
+ }
+}
+
+void blake3_compress_in_place(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+ if (features & AVX512VL) {
+ blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
+ return;
+ }
+#endif
+#endif
+ blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
+}
+
+void blake3_compress_xof(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[64]) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+ if (features & AVX512VL) {
+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
+ return;
+ }
+#endif
+#endif
+ blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
+}
+
+
+void blake3_xof_many(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[64], size_t outblocks) {
+ if (outblocks == 0) {
+ // The current assembly implementation always outputs at least 1 block.
+ return;
+ }
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features);
+#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+ if (features & AVX512VL) {
+ blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
+ return;
+ }
+#endif
+#endif
+ for(size_t i = 0; i < outblocks; ++i) {
+ blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
+ }
+}
+
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+ size_t blocks, const uint32_t key[8], uint64_t counter,
+ bool increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+ blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+ if (features & AVX2) {
+ blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#endif
+
+#if BLAKE3_USE_NEON == 1
+ blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ return;
+#endif
+
+ blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+}
+
+// The dynamically detected SIMD degree of the current platform.
+size_t blake3_simd_degree(void) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+ return 16;
+ }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+ if (features & AVX2) {
+ return 8;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ return 4;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ return 4;
+ }
+#endif
+#endif
+#if BLAKE3_USE_NEON == 1
+ return 4;
+#endif
+ return 1;
+}