From 8e2c307bdb501db0ab0ce2d51bc61b552855ee11 Mon Sep 17 00:00:00 2001 From: Stefan Boberg Date: Mon, 23 Mar 2026 12:54:14 +0100 Subject: Unique session/client tracking using HyperLogLog (#884) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Adds probabilistic cardinality estimation for tracking unique HTTP clients and sessions using a HyperLogLog implementation. - Add a `HyperLogLog` template in `zentelemetry` with thread-safe lock-free register updates, merge support, and XXH3 hashing - Feed client IP addresses (via raw bytes) and session IDs (via `Oid` bytes) into their respective HyperLogLog estimators from both the ASIO and http.sys server backends - Emit `distinct_clients` and `distinct_sessions` cardinality estimates in HTTP `CollectStats()` - Add tests covering empty, single, duplicates, accuracy, merge, and clear scenarios ## Why HyperLogLog Tracking exact unique counts would require storing every observed IP or session ID. HyperLogLog provides a memory-bounded probabilistic estimate (~1–2% error) using only a few KB of memory regardless of traffic volume. --- src/zentelemetry/hyperloglog.cpp | 127 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 src/zentelemetry/hyperloglog.cpp (limited to 'src/zentelemetry/hyperloglog.cpp') diff --git a/src/zentelemetry/hyperloglog.cpp b/src/zentelemetry/hyperloglog.cpp new file mode 100644 index 000000000..3a06fc59f --- /dev/null +++ b/src/zentelemetry/hyperloglog.cpp @@ -0,0 +1,127 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include + +namespace zen { +void +hyperloglog_forcelink() +{ +} +} // namespace zen + +#if ZEN_WITH_TESTS + +# include + +# include + +# include +# include + +namespace zen::tests { + +TEST_SUITE_BEGIN("telemetry.hyperloglog"); + +TEST_CASE("hyperloglog.empty") +{ + metrics::HyperLogLog<> Hll; + CHECK_EQ(Hll.Count(), 0); +} + +TEST_CASE("hyperloglog.single_element") +{ + metrics::HyperLogLog<> Hll; + Hll.Add("hello"); + CHECK(Hll.Count() >= 1); + CHECK(Hll.Count() <= 2); +} + +TEST_CASE("hyperloglog.duplicates") +{ + metrics::HyperLogLog<> Hll; + for (int i = 0; i < 1000; ++i) + { + Hll.Add("same_value"); + } + CHECK_EQ(Hll.Count(), 1); +} + +TEST_CASE("hyperloglog.estimate_accuracy") +{ + // With precision 14, expected error is ~0.81% + // Use a generous margin (5%) for the test to be reliable + constexpr uint64_t N = 100000; + + metrics::HyperLogLog<14> Hll; + for (uint64_t i = 0; i < N; ++i) + { + std::string Value = fmt::format("item_{}", i); + Hll.Add(Value); + } + + uint64_t Estimate = Hll.Count(); + double Error = std::abs(static_cast(Estimate) - static_cast(N)) / static_cast(N); + + CHECK(Error < 0.05); +} + +TEST_CASE("hyperloglog.small_cardinality") +{ + metrics::HyperLogLog<> Hll; + for (int i = 0; i < 10; ++i) + { + std::string Value = fmt::format("key_{}", i); + Hll.Add(Value); + } + + uint64_t Estimate = Hll.Count(); + CHECK(Estimate >= 8); + CHECK(Estimate <= 12); +} + +TEST_CASE("hyperloglog.clear") +{ + metrics::HyperLogLog<> Hll; + for (int i = 0; i < 100; ++i) + { + std::string Value = fmt::format("v_{}", i); + Hll.Add(Value); + } + CHECK(Hll.Count() > 0); + + Hll.Clear(); + CHECK_EQ(Hll.Count(), 0); +} + +TEST_CASE("hyperloglog.merge") +{ + constexpr uint64_t N = 50000; + + metrics::HyperLogLog<14> A; + metrics::HyperLogLog<14> B; + + // A gets even numbers, B gets odd numbers + for (uint64_t i = 0; i < N; ++i) + { + std::string Value = fmt::format("item_{}", i * 2); + A.Add(Value); + } + for (uint64_t i = 0; i < N; ++i) + { + std::string Value = fmt::format("item_{}", i * 2 + 1); + B.Add(Value); + } + + A.Merge(B); + + uint64_t Estimate = A.Count(); + double Error = std::abs(static_cast(Estimate) - static_cast(N * 2)) / static_cast(N * 2); + + CHECK(Error < 0.05); +} + +TEST_SUITE_END(); + +} // namespace zen::tests + +#endif // ZEN_WITH_TESTS -- cgit v1.2.3