diff options
| author | Stefan Boberg <[email protected]> | 2026-03-23 12:54:14 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2026-03-23 12:54:14 +0100 |
| commit | 8e2c307bdb501db0ab0ce2d51bc61b552855ee11 (patch) | |
| tree | 8f9be7e926bc555318a68794ee75ad5ad0dd979f /src/zentelemetry/include | |
| parent | Logger simplification (#883) (diff) | |
| download | zen-8e2c307bdb501db0ab0ce2d51bc61b552855ee11.tar.xz zen-8e2c307bdb501db0ab0ce2d51bc61b552855ee11.zip | |
Unique session/client tracking using HyperLogLog (#884)
## Summary
Adds probabilistic cardinality estimation for tracking unique HTTP clients and sessions using a HyperLogLog implementation.
- Add a `HyperLogLog<Precision>` template in `zentelemetry` with thread-safe lock-free register updates, merge support, and XXH3 hashing
- Feed client IP addresses (via raw bytes) and session IDs (via `Oid` bytes) into their respective HyperLogLog estimators from both the ASIO and http.sys server backends
- Emit `distinct_clients` and `distinct_sessions` cardinality estimates in HTTP `CollectStats()`
- Add tests covering empty, single, duplicates, accuracy, merge, and clear scenarios
## Why HyperLogLog
Tracking exact unique counts would require storing every observed IP or session ID. HyperLogLog provides a memory-bounded probabilistic estimate (~1–2% error) using only a few KB of memory regardless of traffic volume.
Diffstat (limited to 'src/zentelemetry/include')
| -rw-r--r-- | src/zentelemetry/include/zentelemetry/hyperloglog.h | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/src/zentelemetry/include/zentelemetry/hyperloglog.h b/src/zentelemetry/include/zentelemetry/hyperloglog.h new file mode 100644 index 000000000..2daf75a43 --- /dev/null +++ b/src/zentelemetry/include/zentelemetry/hyperloglog.h @@ -0,0 +1,165 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "zentelemetry.h" + +#include <zencore/intmath.h> +#include <zencore/xxhash.h> + +#include <array> +#include <atomic> +#include <cstdint> +#include <string_view> + +namespace zen::metrics { + +/** HyperLogLog cardinality estimator. + * + * Estimates the number of distinct elements in a stream using O(2^Precision) + * bytes of memory. The relative error is approximately 1.04 / sqrt(2^Precision). + * + * @tparam Precision Number of bits used for register indexing (4..16). + * Higher precision = more memory but lower error. + * Default 14 gives 16384 registers (~16 KiB) and ~0.81% error. + * + * All operations are thread-safe. Add() uses relaxed atomic compare-exchange + * so concurrent updates are lock-free but may occasionally lose a race (which + * only means a slightly delayed register update, not a correctness issue). + */ +template<uint8_t Precision = 14> +class HyperLogLog +{ + static_assert(Precision >= 4 && Precision <= 16, "Precision must be between 4 and 16"); + + static constexpr uint32_t RegisterCount = 1u << Precision; + static constexpr uint64_t IndexMask = RegisterCount - 1; + +public: + HyperLogLog() { Clear(); } + + /** Add a raw byte sequence to the sketch. */ + void Add(const void* Data, size_t Size) + { + XXH3_128 Hash = XXH3_128::HashMemory(Data, Size); + AddHash(Hash); + } + + /** Add a string value to the sketch. */ + void Add(std::string_view Value) { Add(Value.data(), Value.size()); } + + /** Add a pre-computed hash to the sketch. */ + void AddHash(const XXH3_128& Hash) + { + // Use the first 8 bytes of the hash as our 64-bit value + uint64_t H; + memcpy(&H, Hash.Hash, sizeof(H)); + + uint32_t Index = static_cast<uint32_t>(H >> (64 - Precision)) & IndexMask; + uint64_t W = H << Precision; + + // Count leading zeros of W + 1 (the +1 ensures we never store 0) + uint8_t Rank = static_cast<uint8_t>(CountLeadingZeros64(W) + 1); + + // Atomically update the register to the maximum observed rank + uint8_t Current = m_Registers[Index].load(std::memory_order_relaxed); + while (Rank > Current) + { + if (m_Registers[Index].compare_exchange_weak(Current, Rank, std::memory_order_relaxed)) + { + break; + } + } + } + + /** Estimate the number of distinct elements added. */ + double Estimate() const + { + // Compute the raw harmonic mean estimate + double Sum = 0.0; + uint32_t ZeroCount = 0; + + for (uint32_t i = 0; i < RegisterCount; ++i) + { + uint8_t Val = m_Registers[i].load(std::memory_order_relaxed); + Sum += 1.0 / static_cast<double>(1ull << Val); + if (Val == 0) + { + ++ZeroCount; + } + } + + double Alpha = AlphaM(); + double RawEstimate = Alpha * RegisterCount * RegisterCount / Sum; + + // Small range correction using LinearCounting + if (RawEstimate <= 2.5 * RegisterCount && ZeroCount > 0) + { + return RegisterCount * std::log(static_cast<double>(RegisterCount) / ZeroCount); + } + + return RawEstimate; + } + + /** Return the estimate as an integer. */ + uint64_t Count() const + { + double E = Estimate(); + return E < 0.5 ? 0 : static_cast<uint64_t>(E + 0.5); + } + + /** Reset all registers to zero. */ + void Clear() + { + for (uint32_t i = 0; i < RegisterCount; ++i) + { + m_Registers[i].store(0, std::memory_order_relaxed); + } + } + + /** Merge another HyperLogLog sketch into this one (union). */ + void Merge(const HyperLogLog& Other) + { + for (uint32_t i = 0; i < RegisterCount; ++i) + { + uint8_t OtherVal = Other.m_Registers[i].load(std::memory_order_relaxed); + uint8_t Current = m_Registers[i].load(std::memory_order_relaxed); + while (OtherVal > Current) + { + if (m_Registers[i].compare_exchange_weak(Current, OtherVal, std::memory_order_relaxed)) + { + break; + } + } + } + } + +private: + std::array<std::atomic<uint8_t>, RegisterCount> m_Registers; + + static double AlphaM() + { + if constexpr (Precision == 4) + { + return 0.673; + } + else if constexpr (Precision == 5) + { + return 0.697; + } + else if constexpr (Precision == 6) + { + return 0.709; + } + else + { + return 0.7213 / (1.0 + 1.079 / RegisterCount); + } + } +}; + +} // namespace zen::metrics + +namespace zen { +void hyperloglog_forcelink(); +} // namespace zen |