diff options
| author | Per Larsson <[email protected]> | 2022-01-24 11:11:10 +0100 |
|---|---|---|
| committer | Per Larsson <[email protected]> | 2022-01-24 11:11:10 +0100 |
| commit | dc6becffb513280170958f94e18c1b2966ade4d1 (patch) | |
| tree | c7f9cccafcc21e241abdecde6f5219ab1009aff6 /zenserver/upstream/upstreamcache.cpp | |
| parent | Format fix. (diff) | |
| download | zen-dc6becffb513280170958f94e18c1b2966ade4d1.tar.xz zen-dc6becffb513280170958f94e18c1b2966ade4d1.zip | |
Refactored upstream cache to better handle different states in prep for dynamic auth tokens.
Diffstat (limited to 'zenserver/upstream/upstreamcache.cpp')
| -rw-r--r-- | zenserver/upstream/upstreamcache.cpp | 690 |
1 files changed, 376 insertions, 314 deletions
diff --git a/zenserver/upstream/upstreamcache.cpp b/zenserver/upstream/upstreamcache.cpp index 65624ef17..3d6641a4f 100644 --- a/zenserver/upstream/upstreamcache.cpp +++ b/zenserver/upstream/upstreamcache.cpp @@ -25,6 +25,7 @@ #include <algorithm> #include <atomic> +#include <shared_mutex> #include <thread> #include <unordered_map> @@ -34,6 +35,52 @@ using namespace std::literals; namespace detail { + class UpstreamStatus + { + public: + UpstreamEndpointState EndpointState() const { return static_cast<UpstreamEndpointState>(m_State.load(std::memory_order_relaxed)); } + + UpstreamEndpointStatus EndpointStatus() const + { + const UpstreamEndpointState State = EndpointState(); + { + std::unique_lock _(m_Mutex); + return {.Reason = m_ErrorText, .State = State}; + } + } + + void Set(UpstreamEndpointState NewState) + { + m_State.store(static_cast<uint32_t>(NewState), std::memory_order_relaxed); + { + std::unique_lock _(m_Mutex); + m_ErrorText.clear(); + } + } + + void Set(UpstreamEndpointState NewState, std::string ErrorText) + { + m_State.store(static_cast<uint32_t>(NewState), std::memory_order_relaxed); + { + std::unique_lock _(m_Mutex); + m_ErrorText = std::move(ErrorText); + } + } + + void SetFromErrorCode(int32_t ErrorCode, std::string_view ErrorText) + { + if (ErrorCode != 0) + { + Set(ErrorCode == 401 ? UpstreamEndpointState::kUnauthorized : UpstreamEndpointState::kError, std::string(ErrorText)); + } + } + + private: + mutable std::mutex m_Mutex; + std::string m_ErrorText; + std::atomic_uint32_t m_State; + }; + class JupiterUpstreamEndpoint final : public UpstreamEndpoint { public: @@ -41,7 +88,8 @@ namespace detail { : m_Log(zen::logging::Get("upstream")) , m_UseLegacyDdc(Options.UseLegacyDdc) { - m_Info.Name = "Horde"sv; + ZEN_ASSERT(!Options.Name.empty()); + m_Info.Name = Options.Name; m_Info.Url = Options.ServiceUrl; m_Client = new CloudCacheClient(Options); } @@ -50,27 +98,45 @@ namespace detail { virtual const UpstreamEndpointInfo& GetEndpointInfo() const override { return m_Info; } - virtual UpstreamEndpointHealth Initialize() override { return CheckHealth(); } - - virtual bool IsHealthy() const override { return m_HealthOk.load(); } - - virtual UpstreamEndpointHealth CheckHealth() override + virtual UpstreamEndpointStatus Initialize() override { try { + if (m_Status.EndpointState() == UpstreamEndpointState::kOk) + { + return {.State = UpstreamEndpointState::kOk}; + } + CloudCacheSession Session(m_Client); const CloudCacheResult Result = Session.Authenticate(); - m_HealthOk = Result.Success && Result.ErrorCode == 0; + if (Result.Success) + { + m_Status.Set(UpstreamEndpointState::kOk); + } + else if (Result.ErrorCode != 0) + { + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + } + else + { + m_Status.Set(UpstreamEndpointState::kUnauthorized); + } - return {.Reason = std::move(Result.Reason), .Ok = Result.Success}; + return m_Status.EndpointStatus(); } catch (std::exception& Err) { - return {.Reason = Err.what(), .Ok = false}; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + + return {.Reason = Err.what(), .State = GetState()}; } } + virtual UpstreamEndpointState GetState() override { return m_Status.EndpointState(); } + + virtual UpstreamEndpointStatus GetStatus() override { return m_Status.EndpointStatus(); } + virtual GetUpstreamCacheResult GetCacheRecord(CacheKey CacheKey, ZenContentType Type) override { ZEN_TRACE_CPU("Upstream::Horde::GetSingleCacheRecord"); @@ -127,6 +193,8 @@ namespace detail { } } + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + if (Result.ErrorCode == 0) { return {.Value = Result.Response, @@ -136,13 +204,13 @@ namespace detail { } else { - m_HealthOk = false; return {.Error{.ErrorCode = Result.ErrorCode, .Reason = std::move(Result.Reason)}}; } } catch (std::exception& Err) { - m_HealthOk = false; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + return {.Error{.ErrorCode = -1, .Reason = Err.what()}}; } } @@ -170,6 +238,8 @@ namespace detail { CloudCacheResult RefResult = Session.GetRef(CacheKey.Bucket, CacheKey.Hash, ZenContentType::kCbObject); AppendResult(RefResult, Result); + m_Status.SetFromErrorCode(RefResult.ErrorCode, RefResult.Reason); + if (RefResult.ErrorCode == 0) { const CbValidateError ValidationResult = ValidateCompactBinary(RefResult.Response, CbValidateMode::All); @@ -180,6 +250,8 @@ namespace detail { CloudCacheResult BlobResult = Session.GetCompressedBlob(AttachmentHash.AsHash()); AppendResult(BlobResult, Result); + m_Status.SetFromErrorCode(BlobResult.ErrorCode, BlobResult.Reason); + if (BlobResult.ErrorCode == 0) { if (CompressedBuffer Chunk = CompressedBuffer::FromCompressed(SharedBuffer(BlobResult.Response))) @@ -187,17 +259,9 @@ namespace detail { Package.AddAttachment(CbAttachment(Chunk)); } } - else - { - m_HealthOk = false; - } }); } } - else - { - m_HealthOk = false; - } } OnComplete({.Key = CacheKey, .KeyIndex = Index, .Record = Record, .Package = Package}); @@ -215,6 +279,8 @@ namespace detail { CloudCacheSession Session(m_Client); const CloudCacheResult Result = Session.GetCompressedBlob(PayloadId); + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + if (Result.ErrorCode == 0) { return {.Value = Result.Response, @@ -224,13 +290,13 @@ namespace detail { } else { - m_HealthOk = false; return {.Error{.ErrorCode = Result.ErrorCode, .Reason = std::move(Result.Reason)}}; } } catch (std::exception& Err) { - m_HealthOk = false; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + return {.Error{.ErrorCode = -1, .Reason = Err.what()}}; } } @@ -255,7 +321,8 @@ namespace detail { Payload = BlobResult.Response; AppendResult(BlobResult, Result); - m_HealthOk = BlobResult.ErrorCode == 0; + + m_Status.SetFromErrorCode(BlobResult.ErrorCode, BlobResult.Reason); } OnComplete({.Request = Request, .RequestIndex = Index, .Payload = Payload}); @@ -292,7 +359,7 @@ namespace detail { } } - m_HealthOk = Result.ErrorCode == 0; + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); return {.Reason = std::move(Result.Reason), .Bytes = Result.Bytes, @@ -323,7 +390,7 @@ namespace detail { BlobResult = Session.PutCompressedBlob(CacheRecord.PayloadIds[Idx], Payloads[Idx]); } - m_HealthOk = BlobResult.ErrorCode == 0; + m_Status.SetFromErrorCode(BlobResult.ErrorCode, BlobResult.Reason); if (!BlobResult.Success) { @@ -344,7 +411,7 @@ namespace detail { RefResult = Session.PutRef(CacheRecord.Key.Bucket, CacheRecord.Key.Hash, RecordValue, ZenContentType::kCbObject); } - m_HealthOk = RefResult.ErrorCode == 0; + m_Status.SetFromErrorCode(RefResult.ErrorCode, RefResult.Reason); if (!RefResult.Success) { @@ -366,7 +433,8 @@ namespace detail { const IoHash RefHash = IoHash::HashBuffer(RecordValue); FinalizeRefResult FinalizeResult = Session.FinalizeRef(CacheRecord.Key.Bucket, CacheRecord.Key.Hash, RefHash); - m_HealthOk = FinalizeResult.ErrorCode == 0; + + m_Status.SetFromErrorCode(FinalizeResult.ErrorCode, FinalizeResult.Reason); if (!FinalizeResult.Success) { @@ -385,7 +453,8 @@ namespace detail { } FinalizeResult = Session.FinalizeRef(CacheRecord.Key.Bucket, CacheRecord.Key.Hash, RefHash); - m_HealthOk = FinalizeResult.ErrorCode == 0; + + m_Status.SetFromErrorCode(FinalizeResult.ErrorCode, FinalizeResult.Reason); if (!FinalizeResult.Success) { @@ -420,7 +489,8 @@ namespace detail { } catch (std::exception& Err) { - m_HealthOk = false; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + return {.Reason = std::string(Err.what()), .Success = false}; } } @@ -444,11 +514,10 @@ namespace detail { spdlog::logger& m_Log; UpstreamEndpointInfo m_Info; + UpstreamStatus m_Status; + UpstreamEndpointStats m_Stats; bool m_UseLegacyDdc; - std::string m_DisplayName; RefPtr<CloudCacheClient> m_Client; - UpstreamEndpointStats m_Stats; - std::atomic_bool m_HealthOk{false}; }; class ZenUpstreamEndpoint final : public UpstreamEndpoint @@ -466,7 +535,7 @@ namespace detail { public: ZenUpstreamEndpoint(const ZenStructuredCacheClientOptions& Options) : m_Log(zen::logging::Get("upstream")) - , m_Info({.Name = std::string("Zen")}) + , m_Info({.Name = std::string(Options.Name)}) , m_ConnectTimeout(Options.ConnectTimeout) , m_Timeout(Options.Timeout) { @@ -480,62 +549,43 @@ namespace detail { virtual const UpstreamEndpointInfo& GetEndpointInfo() const override { return m_Info; } - virtual UpstreamEndpointHealth Initialize() override - { - const ZenEndpoint& Ep = GetEndpoint(); - if (Ep.Ok) - { - m_Info.Url = Ep.Url; - m_Client = new ZenStructuredCacheClient({.Url = m_Info.Url, .ConnectTimeout = m_ConnectTimeout, .Timeout = m_Timeout}); - - m_HealthOk = true; - return {.Ok = true}; - } - - m_HealthOk = false; - return {.Reason = Ep.Reason}; - } - - virtual bool IsHealthy() const override { return m_HealthOk; } - - virtual UpstreamEndpointHealth CheckHealth() override + virtual UpstreamEndpointStatus Initialize() override { try { - if (m_Client.IsNull()) + if (m_Status.EndpointState() == UpstreamEndpointState::kOk) { - const ZenEndpoint& Ep = GetEndpoint(); - if (Ep.Ok) - { - m_Info.Url = Ep.Url; - m_Client = - new ZenStructuredCacheClient({.Url = m_Info.Url, .ConnectTimeout = m_ConnectTimeout, .Timeout = m_Timeout}); - - m_HealthOk = true; - return {.Ok = true}; - } - - return {.Reason = Ep.Reason}; + return {.State = UpstreamEndpointState::kOk}; } - ZenStructuredCacheSession Session(*m_Client); - ZenCacheResult Result; + const ZenEndpoint& Ep = GetEndpoint(); - for (int32_t Attempt = 0, MaxAttempts = 2; Attempt < MaxAttempts && !Result.Success; ++Attempt) + m_Info.Url = Ep.Url; + + if (Ep.Ok) { - Result = Session.CheckHealth(); + m_Client = new ZenStructuredCacheClient({.Url = m_Info.Url, .ConnectTimeout = m_ConnectTimeout, .Timeout = m_Timeout}); + m_Status.Set(UpstreamEndpointState::kOk); + } + else + { + m_Status.Set(UpstreamEndpointState::kError, Ep.Reason); } - m_HealthOk = Result.ErrorCode == 0; - - return {.Reason = std::move(Result.Reason), .Ok = m_HealthOk}; + return m_Status.EndpointStatus(); } catch (std::exception& Err) { - return {.Reason = Err.what(), .Ok = false}; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + + return {.Reason = Err.what(), .State = GetState()}; } } + virtual UpstreamEndpointState GetState() override { return m_Status.EndpointState(); } + + virtual UpstreamEndpointStatus GetStatus() override { return m_Status.EndpointStatus(); } + virtual GetUpstreamCacheResult GetCacheRecord(CacheKey CacheKey, ZenContentType Type) override { ZEN_TRACE_CPU("Upstream::Zen::GetSingleCacheRecord"); @@ -545,6 +595,8 @@ namespace detail { ZenStructuredCacheSession Session(*m_Client); const ZenCacheResult Result = Session.GetCacheRecord(CacheKey.Bucket, CacheKey.Hash, Type); + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + if (Result.ErrorCode == 0) { return {.Value = Result.Response, @@ -554,13 +606,13 @@ namespace detail { } else { - m_HealthOk = false; return {.Error{.ErrorCode = Result.ErrorCode, .Reason = std::move(Result.Reason)}}; } } catch (std::exception& Err) { - m_HealthOk = false; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + return {.Error{.ErrorCode = -1, .Reason = Err.what()}}; } } @@ -608,6 +660,8 @@ namespace detail { Result = Session.InvokeRpc(BatchRequest.Save()); } + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + if (Result.Success) { if (BatchResponse.TryLoad(Result.Response)) @@ -621,10 +675,6 @@ namespace detail { return {.Bytes = Result.Bytes, .ElapsedSeconds = Result.ElapsedSeconds, .Success = true}; } } - else if (Result.ErrorCode) - { - m_HealthOk = false; - } for (size_t Index : KeyIndex) { @@ -643,6 +693,8 @@ namespace detail { ZenStructuredCacheSession Session(*m_Client); const ZenCacheResult Result = Session.GetCachePayload(CacheKey.Bucket, CacheKey.Hash, PayloadId); + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + if (Result.ErrorCode == 0) { return {.Value = Result.Response, @@ -652,13 +704,13 @@ namespace detail { } else { - m_HealthOk = false; return {.Error{.ErrorCode = Result.ErrorCode, .Reason = std::move(Result.Reason)}}; } } catch (std::exception& Err) { - m_HealthOk = false; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + return {.Error{.ErrorCode = -1, .Reason = Err.what()}}; } } @@ -713,6 +765,8 @@ namespace detail { Result = Session.InvokeRpc(BatchRequest.Save()); } + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + if (Result.Success) { if (BatchResponse.TryLoad(Result.Response)) @@ -736,10 +790,6 @@ namespace detail { return {.Bytes = Result.Bytes, .ElapsedSeconds = Result.ElapsedSeconds, .Success = true}; } } - else if (Result.ErrorCode) - { - m_HealthOk = false; - } for (size_t Index : RequestIndex) { @@ -789,10 +839,10 @@ namespace detail { for (uint32_t Attempt = 0; Attempt < MaxAttempts && !Result.Success; Attempt++) { Result = Session.PutCacheRecord(CacheRecord.Key.Bucket, CacheRecord.Key.Hash, PackagePayload, CacheRecord.Type); - - m_HealthOk = Result.ErrorCode == 0; } + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + TotalBytes = Result.Bytes; TotalElapsedSeconds = Result.ElapsedSeconds; } @@ -807,10 +857,10 @@ namespace detail { CacheRecord.Key.Hash, CacheRecord.PayloadIds[Idx], Payloads[Idx]); - - m_HealthOk = Result.ErrorCode == 0; } + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + TotalBytes += Result.Bytes; TotalElapsedSeconds += Result.ElapsedSeconds; @@ -827,10 +877,10 @@ namespace detail { for (uint32_t Attempt = 0; Attempt < MaxAttempts && !Result.Success; Attempt++) { Result = Session.PutCacheRecord(CacheRecord.Key.Bucket, CacheRecord.Key.Hash, RecordValue, CacheRecord.Type); - - m_HealthOk = Result.ErrorCode == 0; } + m_Status.SetFromErrorCode(Result.ErrorCode, Result.Reason); + TotalBytes += Result.Bytes; TotalElapsedSeconds += Result.ElapsedSeconds; } @@ -842,7 +892,8 @@ namespace detail { } catch (std::exception& Err) { - m_HealthOk = false; + m_Status.Set(UpstreamEndpointState::kError, Err.what()); + return {.Reason = std::string(Err.what()), .Success = false}; } } @@ -885,109 +936,18 @@ namespace detail { spdlog::logger& m_Log; UpstreamEndpointInfo m_Info; + UpstreamStatus m_Status; + UpstreamEndpointStats m_Stats; std::vector<ZenEndpoint> m_Endpoints; std::chrono::milliseconds m_ConnectTimeout; std::chrono::milliseconds m_Timeout; RefPtr<ZenStructuredCacheClient> m_Client; - UpstreamEndpointStats m_Stats; - std::atomic_bool m_HealthOk{false}; }; } // namespace detail ////////////////////////////////////////////////////////////////////////// -struct UpstreamStats -{ - static constexpr uint64_t MaxSampleCount = 1000ull; - - UpstreamStats(bool Enabled) : m_Enabled(Enabled) {} - - void Add(spdlog::logger& Logger, - UpstreamEndpoint& Endpoint, - const GetUpstreamCacheResult& Result, - const std::vector<std::unique_ptr<UpstreamEndpoint>>& Endpoints) - { - UpstreamEndpointStats& Stats = Endpoint.Stats(); - - if (Result.Error) - { - Stats.ErrorCount++; - } - else if (Result.Success) - { - Stats.HitCount++; - Stats.DownBytes.fetch_add(Result.Bytes); - Stats.TimeDownMs.fetch_add(uint64_t(Result.ElapsedSeconds * 1000.0)); - } - else - { - Stats.MissCount++; - } - - if (m_Enabled && m_SampleCount++ % MaxSampleCount) - { - Dump(Logger, Endpoints); - } - } - - void Add(spdlog::logger& Logger, - UpstreamEndpoint& Endpoint, - const PutUpstreamCacheResult& Result, - const std::vector<std::unique_ptr<UpstreamEndpoint>>& Endpoints) - { - UpstreamEndpointStats& Stats = Endpoint.Stats(); - if (Result.Success) - { - Stats.UpCount++; - Stats.UpBytes.fetch_add(Result.Bytes); - Stats.TimeUpMs.fetch_add(uint64_t(Result.ElapsedSeconds * 1000.0)); - } - else - { - Stats.ErrorCount++; - } - - if (m_Enabled && m_SampleCount++ % MaxSampleCount) - { - Dump(Logger, Endpoints); - } - } - - void Dump(spdlog::logger& Logger, const std::vector<std::unique_ptr<UpstreamEndpoint>>& Endpoints) - { - for (auto& Ep : Endpoints) - { - // These stats will not be totally correct as the numbers are not captured atomically - UpstreamEndpointStats& Stats = Ep->Stats(); - const uint64_t HitCount = Stats.HitCount; - const uint64_t MissCount = Stats.MissCount; - const double DownMBytes = double(Stats.DownBytes) / 1024.0 / 1024.0; - const double SecondsDown = double(Stats.TimeDownMs) / 1000.0; - const double UpMBytes = double(Stats.UpBytes) / 1024.0 / 1024.0; - const double SecondsUp = double(Stats.TimeUpMs) / 1000.0; - - const double UpSpeed = UpMBytes > 0 ? UpMBytes / SecondsUp : 0.0; - const double DownSpeed = DownMBytes > 0 ? DownMBytes / SecondsDown : 0.0; - const uint64_t TotalCount = HitCount + MissCount; - const double HitRate = TotalCount > 0 ? (double(HitCount) / double(TotalCount)) : 0.0; - - Logger.debug("STATS - '{}', Hit rate: {:.2f}%, DOWN: '{:.2f} MiB {:.2f} MiB/s', UP: '{:.2f} MiB {:.2f} MiB/s'", - Ep->GetEndpointInfo().Name, - HitRate, - DownMBytes, - DownSpeed, - UpMBytes, - UpSpeed); - } - } - - bool m_Enabled; - std::atomic_uint64_t m_SampleCount = {}; -}; - -////////////////////////////////////////////////////////////////////////// - class DefaultUpstreamCache final : public UpstreamCache { public: @@ -996,71 +956,87 @@ public: , m_Options(Options) , m_CacheStore(CacheStore) , m_CidStore(CidStore) - , m_Stats(Options.StatsEnabled) { } virtual ~DefaultUpstreamCache() { Shutdown(); } - virtual bool Initialize() override + virtual void Initialize() override { - for (auto& Endpoint : m_Endpoints) + for (uint32_t Idx = 0; Idx < m_Options.ThreadCount; Idx++) { - const UpstreamEndpointHealth Health = Endpoint->Initialize(); - const UpstreamEndpointInfo& Info = Endpoint->GetEndpointInfo(); - - if (Health.Ok) - { - ZEN_INFO("'{}' endpoint '{}' OK", Info.Name, Info.Url); - } - else - { - ZEN_WARN("'{}' endpoint '{}' FAILED, reason '{}'", Info.Name, Info.Url, Health.Reason); - } + m_UpstreamThreads.emplace_back(&DefaultUpstreamCache::ProcessUpstreamQueue, this); } - m_RunState.IsRunning = !m_Endpoints.empty(); + m_EndpointMonitorThread = std::thread(&DefaultUpstreamCache::MonitorEndpoints, this); + m_RunState.IsRunning = true; + } + + virtual void RegisterEndpoint(std::unique_ptr<UpstreamEndpoint> Endpoint) override + { + const UpstreamEndpointStatus Status = Endpoint->Initialize(); + const UpstreamEndpointInfo& Info = Endpoint->GetEndpointInfo(); - if (m_RunState.IsRunning) + ZEN_INFO("register endpoint '{} - {}' {}", Info.Name, Info.Url, ToString(Status.State)); + + // Register endpoint even if it fails, the health monitor thread will probe failing endpoint(s) + std::unique_lock<std::shared_mutex> _(m_EndpointsMutex); + m_Endpoints.emplace_back(std::move(Endpoint)); + } + + virtual void IterateEndpoints(std::function<bool(UpstreamEndpoint&)>&& Fn) override + { + std::shared_lock<std::shared_mutex> _(m_EndpointsMutex); + + for (auto& Ep : m_Endpoints) { - for (uint32_t Idx = 0; Idx < m_Options.ThreadCount; Idx++) + if (!Fn(*Ep)) { - m_UpstreamThreads.emplace_back(&DefaultUpstreamCache::ProcessUpstreamQueue, this); + break; } - - m_EndpointMonitorThread = std::thread(&DefaultUpstreamCache::MonitorEndpoints, this); } - - return m_RunState.IsRunning; } - virtual void RegisterEndpoint(std::unique_ptr<UpstreamEndpoint> Endpoint) override { m_Endpoints.emplace_back(std::move(Endpoint)); } - virtual GetUpstreamCacheResult GetCacheRecord(CacheKey CacheKey, ZenContentType Type) override { ZEN_TRACE_CPU("Upstream::GetCacheRecord"); + std::shared_lock<std::shared_mutex> _(m_EndpointsMutex); + if (m_Options.ReadUpstream) { for (auto& Endpoint : m_Endpoints) { - if (Endpoint->IsHealthy()) + if (Endpoint->GetState() != UpstreamEndpointState::kOk) { - const GetUpstreamCacheResult Result = Endpoint->GetCacheRecord(CacheKey, Type); - m_Stats.Add(m_Log, *Endpoint, Result, m_Endpoints); + continue; + } - if (Result.Success) - { - return Result; - } + UpstreamEndpointStats& Stats = Endpoint->Stats(); + GetUpstreamCacheResult Result; + { + metrics::OperationTiming::Scope Scope(Stats.CacheGetRequestTiming); + Result = Endpoint->GetCacheRecord(CacheKey, Type); + } - if (Result.Error) - { - ZEN_ERROR("get cache record FAILED, endpoint '{}', reason '{}', error code '{}'", - Endpoint->GetEndpointInfo().Url, - Result.Error.Reason, - Result.Error.ErrorCode); - } + Stats.CacheGetCount.Increment(1); + Stats.CacheGetTotalBytes.Increment(Result.Bytes); + + if (Result.Success) + { + Stats.CacheHitCount.Increment(1); + + return Result; + } + + if (Result.Error) + { + Stats.CacheErrorCount.Increment(1); + + ZEN_ERROR("get cache record FAILED, endpoint '{}', reason '{}', error code '{}'", + Endpoint->GetEndpointInfo().Url, + Result.Error.Reason, + Result.Error.ErrorCode); } } } @@ -1075,42 +1051,62 @@ public: { ZEN_TRACE_CPU("Upstream::GetCacheRecords"); - std::vector<size_t> MissingKeys(KeyIndex.begin(), KeyIndex.end()); + std::shared_lock<std::shared_mutex> _(m_EndpointsMutex); + + std::vector<size_t> RemainingKeys(KeyIndex.begin(), KeyIndex.end()); if (m_Options.ReadUpstream) { for (auto& Endpoint : m_Endpoints) { - if (Endpoint->IsHealthy() && !MissingKeys.empty()) + if (RemainingKeys.empty()) + { + break; + } + + if (Endpoint->GetState() != UpstreamEndpointState::kOk) + { + continue; + } + + UpstreamEndpointStats& Stats = Endpoint->Stats(); + std::vector<size_t> Missing; + GetUpstreamCacheResult Result; { - std::vector<size_t> Missing; + metrics::OperationTiming::Scope Scope(Stats.CacheGetRequestTiming); - auto Result = Endpoint->GetCacheRecords(CacheKeys, MissingKeys, Policy, [&](CacheRecordGetCompleteParams&& Params) { + Result = Endpoint->GetCacheRecords(CacheKeys, RemainingKeys, Policy, [&](CacheRecordGetCompleteParams&& Params) { if (Params.Record) { OnComplete(std::forward<CacheRecordGetCompleteParams>(Params)); + + Stats.CacheHitCount.Increment(1); } else { Missing.push_back(Params.KeyIndex); } }); + } - if (Result.Error) - { - ZEN_ERROR("get cache record(s) (rpc) FAILED, endpoint '{}', reason '{}', error code '{}'", - Endpoint->GetEndpointInfo().Url, - Result.Error.Reason, - Result.Error.ErrorCode); - } + Stats.CacheGetCount.Increment(int64_t(RemainingKeys.size())); + Stats.CacheGetTotalBytes.Increment(Result.Bytes); - m_Stats.Add(m_Log, *Endpoint, Result, m_Endpoints); - MissingKeys = std::move(Missing); + if (Result.Error) + { + Stats.CacheErrorCount.Increment(1); + + ZEN_ERROR("get cache record(s) (rpc) FAILED, endpoint '{}', reason '{}', error code '{}'", + Endpoint->GetEndpointInfo().Url, + Result.Error.Reason, + Result.Error.ErrorCode); } + + RemainingKeys = std::move(Missing); } } - for (size_t Index : MissingKeys) + for (size_t Index : RemainingKeys) { OnComplete({.Key = CacheKeys[Index], .KeyIndex = Index, .Record = CbObjectView(), .Package = CbPackage()}); } @@ -1122,43 +1118,62 @@ public: { ZEN_TRACE_CPU("Upstream::GetCachePayloads"); - std::vector<size_t> MissingPayloads(RequestIndex.begin(), RequestIndex.end()); + std::shared_lock<std::shared_mutex> _(m_EndpointsMutex); + + std::vector<size_t> RemainingKeys(RequestIndex.begin(), RequestIndex.end()); if (m_Options.ReadUpstream) { for (auto& Endpoint : m_Endpoints) { - if (Endpoint->IsHealthy() && !MissingPayloads.empty()) + if (RemainingKeys.empty()) { - std::vector<size_t> Missing; + break; + } - auto Result = - Endpoint->GetCachePayloads(CacheChunkRequests, MissingPayloads, [&](CachePayloadGetCompleteParams&& Params) { - if (Params.Payload) - { - OnComplete(std::forward<CachePayloadGetCompleteParams>(Params)); - } - else - { - Missing.push_back(Params.RequestIndex); - } - }); + if (Endpoint->GetState() != UpstreamEndpointState::kOk) + { + continue; + } - if (Result.Error) - { - ZEN_ERROR("get cache payloads(s) (rpc) FAILED, endpoint '{}', reason '{}', error code '{}'", - Endpoint->GetEndpointInfo().Url, - Result.Error.Reason, - Result.Error.ErrorCode); - } + UpstreamEndpointStats& Stats = Endpoint->Stats(); + std::vector<size_t> Missing; + GetUpstreamCacheResult Result; + { + metrics::OperationTiming::Scope Scope(Endpoint->Stats().CacheGetRequestTiming); + + Result = Endpoint->GetCachePayloads(CacheChunkRequests, RemainingKeys, [&](CachePayloadGetCompleteParams&& Params) { + if (Params.Payload) + { + OnComplete(std::forward<CachePayloadGetCompleteParams>(Params)); + + Stats.CacheHitCount.Increment(1); + } + else + { + Missing.push_back(Params.RequestIndex); + } + }); + } - m_Stats.Add(m_Log, *Endpoint, Result, m_Endpoints); - MissingPayloads = std::move(Missing); + Stats.CacheGetCount.Increment(int64_t(RemainingKeys.size())); + Stats.CacheGetTotalBytes.Increment(Result.Bytes); + + if (Result.Error) + { + Stats.CacheErrorCount.Increment(1); + + ZEN_ERROR("get cache payloads(s) (rpc) FAILED, endpoint '{}', reason '{}', error code '{}'", + Endpoint->GetEndpointInfo().Url, + Result.Error.Reason, + Result.Error.ErrorCode); } + + RemainingKeys = std::move(Missing); } } - for (size_t Index : MissingPayloads) + for (size_t Index : RemainingKeys) { OnComplete({.Request = CacheChunkRequests[Index], .RequestIndex = Index, .Payload = IoBuffer()}); } @@ -1172,23 +1187,37 @@ public: { for (auto& Endpoint : m_Endpoints) { - if (Endpoint->IsHealthy()) + if (Endpoint->GetState() != UpstreamEndpointState::kOk) { - const GetUpstreamCacheResult Result = Endpoint->GetCachePayload(CacheKey, PayloadId); - m_Stats.Add(m_Log, *Endpoint, Result, m_Endpoints); + continue; + } - if (Result.Success) - { - return Result; - } + UpstreamEndpointStats& Stats = Endpoint->Stats(); + GetUpstreamCacheResult Result; - if (Result.Error) - { - ZEN_ERROR("get cache payload FAILED, endpoint '{}', reason '{}', error code '{}'", - Endpoint->GetEndpointInfo().Url, - Result.Error.Reason, - Result.Error.ErrorCode); - } + { + metrics::OperationTiming::Scope Scope(Stats.CacheGetRequestTiming); + Result = Endpoint->GetCachePayload(CacheKey, PayloadId); + } + + Stats.CacheGetCount.Increment(1); + Stats.CacheGetTotalBytes.Increment(Result.Bytes); + + if (Result.Success) + { + Stats.CacheHitCount.Increment(1); + + return Result; + } + + if (Result.Error) + { + Stats.CacheErrorCount.Increment(1); + + ZEN_ERROR("get cache payload FAILED, endpoint '{}', reason '{}', error code '{}'", + Endpoint->GetEndpointInfo().Url, + Result.Error.Reason, + Result.Error.ErrorCode); } } } @@ -1196,7 +1225,7 @@ public: return {}; } - virtual EnqueueResult EnqueueUpstream(UpstreamCacheRecord CacheRecord) override + virtual void EnqueueUpstream(UpstreamCacheRecord CacheRecord) override { if (m_RunState.IsRunning && m_Options.WriteUpstream) { @@ -1208,11 +1237,7 @@ public: { ProcessCacheRecord(std::move(CacheRecord)); } - - return {.Success = true}; } - - return {}; } virtual void GetStatus(CbObjectWriter& Status) override @@ -1225,22 +1250,35 @@ public: Status.BeginArray("endpoints"); for (const auto& Ep : m_Endpoints) { - const UpstreamEndpointInfo& Info = Ep->GetEndpointInfo(); - Status.BeginObject(); - Status << "name" << Info.Name; - Status << "url" << Info.Url; - Status << "health" << (Ep->IsHealthy() ? "ok"sv : "inactive"sv); + const UpstreamEndpointInfo& EpInfo = Ep->GetEndpointInfo(); + const UpstreamEndpointStatus EpStatus = Ep->GetStatus(); + UpstreamEndpointStats& EpStats = Ep->Stats(); - UpstreamEndpointStats& Stats = Ep->Stats(); - const uint64_t HitCount = Stats.HitCount; - const uint64_t MissCount = Stats.MissCount; - const uint64_t TotalCount = HitCount + MissCount; - const double HitRate = TotalCount > 0 ? (double(HitCount) / double(TotalCount)) : 0.0; + Status.BeginObject(); + Status << "name" << EpInfo.Name; + Status << "url" << EpInfo.Url; + Status << "state" << ToString(EpStatus.State); + Status << "reason" << EpStatus.Reason; - Status << "hit_ratio" << HitRate; - Status << "downloaded_mb" << (double(Stats.DownBytes) / 1024.0 / 1024.0); - Status << "uploaded_mb" << Stats.UpBytes; - Status << "error_count" << Stats.ErrorCount; + Status.BeginObject("cache"sv); + { + const int64_t GetCount = EpStats.CacheGetCount.Value(); + const int64_t HitCount = EpStats.CacheHitCount.Value(); + const int64_t ErrorCount = EpStats.CacheErrorCount.Value(); + const double HitRatio = GetCount > 0 ? double(HitCount) / double(GetCount) : 0.0; + const double ErrorRatio = GetCount > 0 ? double(ErrorCount) / double(GetCount) : 0.0; + + metrics::EmitSnapshot("get_requests"sv, EpStats.CacheGetRequestTiming, Status); + Status << "get_bytes" << EpStats.CacheGetTotalBytes.Value(); + Status << "get_count" << GetCount; + Status << "hit_count" << HitCount; + Status << "hit_ratio" << HitRatio; + Status << "error_count" << ErrorCount; + Status << "error_ratio" << ErrorRatio; + metrics::EmitSnapshot("put_requests"sv, EpStats.CachePutRequestTiming, Status); + Status << "put_bytes" << EpStats.CachePutTotalBytes.Value(); + } + Status.EndObject(); Status.EndObject(); } @@ -1277,21 +1315,31 @@ private: } } + std::shared_lock<std::shared_mutex> _(m_EndpointsMutex); + for (auto& Endpoint : m_Endpoints) { - if (Endpoint->IsHealthy()) + if (Endpoint->GetState() != UpstreamEndpointState::kOk) { - const PutUpstreamCacheResult Result = Endpoint->PutCacheRecord(CacheRecord, CacheValue.Value, std::span(Payloads)); - m_Stats.Add(m_Log, *Endpoint, Result, m_Endpoints); + continue; + } - if (!Result.Success) - { - ZEN_WARN("upload cache record '{}/{}' FAILED, endpoint '{}', reason '{}'", - CacheRecord.Key.Bucket, - CacheRecord.Key.Hash, - Endpoint->GetEndpointInfo().Url, - Result.Reason); - } + UpstreamEndpointStats& Stats = Endpoint->Stats(); + PutUpstreamCacheResult Result; + { + metrics::OperationTiming::Scope Scope(Stats.CachePutRequestTiming); + Result = Endpoint->PutCacheRecord(CacheRecord, CacheValue.Value, std::span(Payloads)); + } + + Stats.CachePutTotalBytes.Increment(Result.Bytes); + + if (!Result.Success) + { + ZEN_WARN("upload cache record '{}/{}' FAILED, endpoint '{}', reason '{}'", + CacheRecord.Key.Bucket, + CacheRecord.Key.Hash, + Endpoint->GetEndpointInfo().Url, + Result.Reason); } } } @@ -1334,21 +1382,35 @@ private: try { - for (auto& Endpoint : m_Endpoints) + std::vector<UpstreamEndpoint*> Endpoints; + { - if (!Endpoint->IsHealthy()) + std::shared_lock<std::shared_mutex> _(m_EndpointsMutex); + + for (auto& Endpoint : m_Endpoints) { - const UpstreamEndpointInfo& Info = Endpoint->GetEndpointInfo(); - if (const UpstreamEndpointHealth Health = Endpoint->CheckHealth(); Health.Ok) + if (Endpoint->GetState() == UpstreamEndpointState::kError || + Endpoint->GetState() == UpstreamEndpointState::kUnauthorized) { - ZEN_INFO("health check endpoint '{} - {}' OK", Info.Name, Info.Url, Health.Reason); - } - else - { - ZEN_WARN("health check endpoint '{} - {}' FAILED, reason '{}'", Info.Name, Info.Url, Health.Reason); + Endpoints.push_back(Endpoint.get()); } } } + + for (auto& Endpoint : Endpoints) + { + const UpstreamEndpointInfo& Info = Endpoint->GetEndpointInfo(); + const UpstreamEndpointStatus Status = Endpoint->Initialize(); + + if (Status.State == UpstreamEndpointState::kOk) + { + ZEN_INFO("health check endpoint '{} - {}' OK", Info.Name, Info.Url); + } + else + { + ZEN_WARN("health check endpoint '{} - {}' FAILED, reason '{}'", Info.Name, Info.Url, Status.Reason); + } + } } catch (std::exception& Err) { @@ -1403,7 +1465,7 @@ private: ZenCacheStore& m_CacheStore; CidStore& m_CidStore; UpstreamQueue m_UpstreamQueue; - UpstreamStats m_Stats; + std::shared_mutex m_EndpointsMutex; std::vector<std::unique_ptr<UpstreamEndpoint>> m_Endpoints; std::vector<std::thread> m_UpstreamThreads; std::thread m_EndpointMonitorThread; |