diff options
| author | Dan Engelbrecht <[email protected]> | 2024-08-14 10:22:57 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2024-08-14 10:22:57 +0200 |
| commit | 831f0787fca39682dec0141714eaebb7772b8639 (patch) | |
| tree | a72de0a27bb2d62589c3d968b3e69b43ef3ce0fb | |
| parent | improve logging on main failure (#111) (diff) | |
| download | zen-831f0787fca39682dec0141714eaebb7772b8639.tar.xz zen-831f0787fca39682dec0141714eaebb7772b8639.zip | |
hardening and reduced spam from GC on failure (#112)
* Retry writing GC state if it fails to handle transient problems
* If GC operation fails demote errors to warnings on consecutive fails
| -rw-r--r-- | CHANGELOG.md | 2 | ||||
| -rw-r--r-- | src/zenstore/gc.cpp | 370 | ||||
| -rw-r--r-- | src/zenstore/include/zenstore/gc.h | 5 |
3 files changed, 234 insertions, 143 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index e89f1709a..d56570520 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ - Improvement: A request for an out of range project store chunk will now result in a "not found" status result - Improvement: Callstack logs on assert/error is now indented for better readability - Improvement: Full command line is now logged on startup and added as context information in Sentry reports +- Improvement: Retry writing GC state if it fails to handle transient problems +- Improvement: If GC operation fails we report an error, after that errors are demoted to warnings until we manage to do a successful GC run. This is to reduce spam on Sentry. ## 5.5.3 - Feature: New 'workspaces' service which allows a user to share a local folder via zenserver. A workspace can have mulitple workspace shares and they provie an HTTP API that is compatible with the project oplog HTTP API. Workspaces and shares are preserved between runs. Workspaces feature is disabled by default - enable with `--workspaces-enabled` option when launching zenserver. diff --git a/src/zenstore/gc.cpp b/src/zenstore/gc.cpp index dc75a29f2..8ed74b54b 100644 --- a/src/zenstore/gc.cpp +++ b/src/zenstore/gc.cpp @@ -1726,6 +1726,8 @@ GcScheduler::SchedulerThread() std::chrono::seconds WaitTime{0}; + bool SilenceErrors = false; + for (;;) { bool Timeout = false; @@ -2016,15 +2018,26 @@ GcScheduler::SchedulerThread() } } - CollectGarbage(CacheExpireTime, - ProjectStoreExpireTime, - DoDelete, - CollectSmallObjects, - SkipCid, - UseGCVersion, - CompactBlockUsageThresholdPercent, - Verbose, - SingleThreaded); + bool GcSuccess = CollectGarbage(CacheExpireTime, + ProjectStoreExpireTime, + DoDelete, + CollectSmallObjects, + SkipCid, + UseGCVersion, + CompactBlockUsageThresholdPercent, + Verbose, + SingleThreaded, + SilenceErrors); + if (!GcSuccess) + { + SilenceErrors = true; + ZEN_INFO("gc going into error state - gc errors will be demoted to warnings until we recover"); + } + else if (SilenceErrors) + { + SilenceErrors = false; + ZEN_INFO("gc recovered from error state - gc errors will be will be reported as errors again"); + } m_GcManager.SetCancelGC(false); @@ -2103,7 +2116,7 @@ GcScheduler::ScrubStorage(bool DoDelete, bool SkipCid, std::chrono::seconds Time ZEN_INFO("scrubbing DONE (in {})", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); } -void +bool GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, const GcClock::TimePoint& ProjectStoreExpireTime, bool Delete, @@ -2112,176 +2125,251 @@ GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, GcVersion UseGCVersion, uint32_t CompactBlockUsageThresholdPercent, bool Verbose, - bool SingleThreaded) + bool SingleThreaded, + bool SilenceErrors) { ZEN_TRACE_CPU("GcScheduler::CollectGarbage"); - GcContext GcCtx(CacheExpireTime, ProjectStoreExpireTime); - GcCtx.SetDeletionMode(Delete); - GcCtx.SetSkipCid(SkipCid); - GcCtx.CollectSmallObjects(CollectSmallObjects); - GcCtx.DiskReservePath(m_Config.RootDirectory / "reserve.gc"); + try + { + GcContext GcCtx(CacheExpireTime, ProjectStoreExpireTime); + GcCtx.SetDeletionMode(Delete); + GcCtx.SetSkipCid(SkipCid); + GcCtx.CollectSmallObjects(CollectSmallObjects); + GcCtx.DiskReservePath(m_Config.RootDirectory / "reserve.gc"); - auto ReclaimDiskReserve = [&]() { - std::error_code Ec = CreateGCReserve(m_Config.RootDirectory / "reserve.gc", m_Config.DiskReserveSize); - if (Ec) - { - ZEN_WARN("unable to create GC reserve at '{}' with size {}, reason: '{}'", - m_Config.RootDirectory / "reserve.gc", - NiceBytes(m_Config.DiskReserveSize), - Ec.message()); - } - }; + auto ReclaimDiskReserve = [&]() { + std::error_code Ec = CreateGCReserve(m_Config.RootDirectory / "reserve.gc", m_Config.DiskReserveSize); + if (Ec) + { + ZEN_WARN("unable to create GC reserve at '{}' with size {}, reason: '{}'", + m_Config.RootDirectory / "reserve.gc", + NiceBytes(m_Config.DiskReserveSize), + Ec.message()); + } + }; - ReclaimDiskReserve(); - const auto _ = MakeGuard([&] { ReclaimDiskReserve(); }); + ReclaimDiskReserve(); + const auto _ = MakeGuard([&] { ReclaimDiskReserve(); }); - CheckDiskSpace(); + CheckDiskSpace(); - if (m_AreDiskWritesBlocked.load()) - { - // We are low on disk, check if we can release our extra storage reserve, if we can't bail from doing GC - uint64_t ReleasedSpace = GcCtx.ClaimGCReserve(); - if (ReleasedSpace == 0) + if (m_AreDiskWritesBlocked.load()) { - ZEN_WARN("Disk space is very low and we have no GC reserve, skipping GC as this requires at least some space to write to '{}'", - m_Config.RootDirectory); - return; + // We are low on disk, check if we can release our extra storage reserve, if we can't bail from doing GC + uint64_t ReleasedSpace = GcCtx.ClaimGCReserve(); + if (ReleasedSpace == 0) + { + ZEN_WARN( + "Disk space is very low and we have no GC reserve, skipping GC as this requires at least some space to write to '{}'", + m_Config.RootDirectory); + return true; // Treat this as a success as we don't want to silence any errors that happens when we are no longer low on + // disk space + } } - } - ZEN_INFO("garbage collection STARTING, small objects gc {}, {} CAS. Cache cutoff time {}, project store cutoff time {}", - GcCtx.CollectSmallObjects() ? "ENABLED"sv : "DISABLED"sv, - SkipCid ? "skip"sv : "include"sv, - CacheExpireTime, - ProjectStoreExpireTime); - { - Stopwatch Timer; - const auto __ = MakeGuard([&] { ZEN_INFO("garbage collection DONE in {}", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); }); - - GcStorageSize Diff; - switch (UseGCVersion) + ZEN_INFO("garbage collection STARTING, small objects gc {}, {} CAS. Cache cutoff time {}, project store cutoff time {}", + GcCtx.CollectSmallObjects() ? "ENABLED"sv : "DISABLED"sv, + SkipCid ? "skip"sv : "include"sv, + CacheExpireTime, + ProjectStoreExpireTime); { - case GcVersion::kV1: - Diff = m_GcManager.CollectGarbage(GcCtx); - if (SkipCid) + Stopwatch Timer; + const auto __ = MakeGuard([&] { ZEN_INFO("garbage collection DONE in {}", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); }); + + GcStorageSize Diff; + switch (UseGCVersion) + { + case GcVersion::kV1: + Diff = m_GcManager.CollectGarbage(GcCtx); + if (SkipCid) + { + m_LastLightweightGCV2Result.reset(); + } + else + { + m_LastFullGCV2Result.reset(); + } + break; + case GcVersion::kV2: + { + const GcSettings Settings = {.CacheExpireTime = CacheExpireTime, + .ProjectStoreExpireTime = ProjectStoreExpireTime, + .CollectSmallObjects = CollectSmallObjects, + .IsDeleteMode = Delete, + .SkipCidDelete = SkipCid, + .Verbose = Verbose, + .SingleThread = SingleThreaded, + .CompactBlockUsageThresholdPercent = CompactBlockUsageThresholdPercent, + .DiskReservePath = m_Config.RootDirectory / "reserve.gc"}; + GcClock::TimePoint GcStartTime = GcClock::Now(); + GcResult Result = m_GcManager.CollectGarbage(Settings); + + ZEN_INFO( + "GCV2: Found {} expired items out of {}, deleted {}. " + "Found {} unreferenced Cid entries out of {}, deleted {}. " + "Freed {} on disk and {} of memory in {}. " + "CacheExpireTime: {}, ProjectStoreExpireTime: {}, CollectSmallObjects: {}, " + "IsDeleteMode: {}, SkipCidDelete: {}", + Result.ReferencerStatSum.RemoveExpiredDataStats.FoundCount, + Result.ReferencerStatSum.RemoveExpiredDataStats.CheckedCount, + Result.ReferencerStatSum.RemoveExpiredDataStats.DeletedCount, + + Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.FoundCount, + Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.CheckedCount, + Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.DeletedCount, + + NiceBytes(Result.CompactStoresStatSum.RemovedDisk), + NiceBytes(Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory), + NiceTimeSpanMs(Result.ElapsedMS.count()), + Settings.CacheExpireTime, + Settings.ProjectStoreExpireTime, + Settings.CollectSmallObjects, + Settings.IsDeleteMode, + Settings.SkipCidDelete); + + AppendGCLog(GcStartTime, Settings, Result); + + if (SkipCid) + { + m_LastLightweightGCV2Result = Result; + } + else + { + m_LastFullGCV2Result = Result; + } + Diff.DiskSize = Result.CompactStoresStatSum.RemovedDisk; + Diff.MemorySize = Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory; + } + break; + } + + std::chrono::milliseconds ElapsedMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); + if (SkipCid) + { + m_LastLightweightGcTime = GcClock::Now(); + m_LastLightweightGcDuration = ElapsedMS; + m_LastLightweightGCDiff = Diff; + } + else + { + if (Delete) { - m_LastLightweightGCV2Result.reset(); + GcClock::TimePoint KeepRangeStart = Min(CacheExpireTime, ProjectStoreExpireTime); + m_LastGcExpireTime = KeepRangeStart; + std::unique_lock Lock(m_GcMutex); + m_DiskUsageWindow.KeepRange(KeepRangeStart.time_since_epoch().count(), GcClock::Duration::max().count()); } - else + + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; + m_LastFullGcDuration = ElapsedMS; + m_LastFullGCDiff = Diff; + } + + for (uint32_t RetryCount = 0; RetryCount < 3; RetryCount++) + { + if (RetryCount > 0) { - m_LastFullGCV2Result.reset(); + ZEN_INFO("Writing GC state failed {} time(s), pausing and trying again", RetryCount); + Sleep(250); } - break; - case GcVersion::kV2: + try { - const GcSettings Settings = {.CacheExpireTime = CacheExpireTime, - .ProjectStoreExpireTime = ProjectStoreExpireTime, - .CollectSmallObjects = CollectSmallObjects, - .IsDeleteMode = Delete, - .SkipCidDelete = SkipCid, - .Verbose = Verbose, - .SingleThread = SingleThreaded, - .CompactBlockUsageThresholdPercent = CompactBlockUsageThresholdPercent, - .DiskReservePath = m_Config.RootDirectory / "reserve.gc"}; - GcClock::TimePoint GcStartTime = GcClock::Now(); - GcResult Result = m_GcManager.CollectGarbage(Settings); - - ZEN_INFO( - "GCV2: Found {} expired items out of {}, deleted {}. " - "Found {} unreferenced Cid entries out of {}, deleted {}. " - "Freed {} on disk and {} of memory in {}. " - "CacheExpireTime: {}, ProjectStoreExpireTime: {}, CollectSmallObjects: {}, " - "IsDeleteMode: {}, SkipCidDelete: {}", - Result.ReferencerStatSum.RemoveExpiredDataStats.FoundCount, - Result.ReferencerStatSum.RemoveExpiredDataStats.CheckedCount, - Result.ReferencerStatSum.RemoveExpiredDataStats.DeletedCount, - - Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.FoundCount, - Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.CheckedCount, - Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.DeletedCount, - - NiceBytes(Result.CompactStoresStatSum.RemovedDisk), - NiceBytes(Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory), - NiceTimeSpanMs(Result.ElapsedMS.count()), - Settings.CacheExpireTime, - Settings.ProjectStoreExpireTime, - Settings.CollectSmallObjects, - Settings.IsDeleteMode, - Settings.SkipCidDelete); - - AppendGCLog(GcStartTime, Settings, Result); - - if (SkipCid) + const fs::path Path = m_Config.RootDirectory / "gc_state"; + ZEN_DEBUG("saving scheduler state to '{}'", Path); + CbObjectWriter SchedulerState; + SchedulerState << "LastGcTime"sv << static_cast<int64_t>(m_LastGcTime.time_since_epoch().count()); + SchedulerState << "LastGcExpireTime"sv << static_cast<int64_t>(m_LastGcExpireTime.time_since_epoch().count()); + SaveCompactBinaryObject(Path, SchedulerState.Save()); + if (RetryCount > 0) { - m_LastLightweightGCV2Result = Result; + ZEN_INFO("Writing GC state succeeded after {} attempts", RetryCount + 1); + } + break; + } + catch (const std::system_error& SystemError) + { + if (IsOOM(SystemError.code())) + { + ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", SystemError.what()); + } + else if (IsOOD(SystemError.code())) + { + ZEN_WARN("writing gc scheduler state ran out of disk space: '{}'", SystemError.what()); + } + if (RetryCount == 0) + { + ZEN_ERROR("writing gc scheduler state failed with system error exception: '{}'", SystemError.what()); } else { - m_LastFullGCV2Result = Result; + ZEN_WARN("writing gc scheduler state failed with system error exception: '{}'", SystemError.what()); } - Diff.DiskSize = Result.CompactStoresStatSum.RemovedDisk; - Diff.MemorySize = Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory; } - break; + catch (const std::bad_alloc& BadAlloc) + { + ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", BadAlloc.what()); + } + catch (const std::exception& Ex) + { + if (RetryCount == 0) + { + ZEN_ERROR("writing gc scheduler state failed with: '{}'", Ex.what()); + } + else + { + ZEN_WARN("writing gc scheduler state failed with: '{}'", Ex.what()); + } + } + } } - - std::chrono::milliseconds ElapsedMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); - if (SkipCid) + return true; + } + catch (const std::system_error& SystemError) + { + if (IsOOM(SystemError.code())) { - m_LastLightweightGcTime = GcClock::Now(); - m_LastLightweightGcDuration = ElapsedMS; - m_LastLightweightGCDiff = Diff; + ZEN_WARN("scheduling garbage collection ran out of memory: '{}'", SystemError.what()); } - else - { - if (Delete) - { - GcClock::TimePoint KeepRangeStart = Min(CacheExpireTime, ProjectStoreExpireTime); - m_LastGcExpireTime = KeepRangeStart; - std::unique_lock Lock(m_GcMutex); - m_DiskUsageWindow.KeepRange(KeepRangeStart.time_since_epoch().count(), GcClock::Duration::max().count()); - } - - m_LastGcTime = GcClock::Now(); - m_LastLightweightGcTime = m_LastGcTime; - m_LastFullGcDuration = ElapsedMS; - m_LastFullGCDiff = Diff; - } - - try + else if (IsOOD(SystemError.code())) { - const fs::path Path = m_Config.RootDirectory / "gc_state"; - ZEN_DEBUG("saving scheduler state to '{}'", Path); - CbObjectWriter SchedulerState; - SchedulerState << "LastGcTime"sv << static_cast<int64_t>(m_LastGcTime.time_since_epoch().count()); - SchedulerState << "LastGcExpireTime"sv << static_cast<int64_t>(m_LastGcExpireTime.time_since_epoch().count()); - SaveCompactBinaryObject(Path, SchedulerState.Save()); + ZEN_WARN("scheduling garbage collection ran out of disk space: '{}'", SystemError.what()); } - catch (const std::system_error& SystemError) + else { - if (IsOOM(SystemError.code())) - { - ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", SystemError.what()); - } - else if (IsOOD(SystemError.code())) + if (SilenceErrors) { - ZEN_WARN("writing gc scheduler state ran out of disk space: '{}'", SystemError.what()); + ZEN_WARN("scheduling garbage collection failed with system error exception: '{}'", SystemError.what()); } else { - ZEN_ERROR("writing gc scheduler state failed with system error exception: '{}'", SystemError.what()); + ZEN_ERROR("scheduling garbage collection failed with system error exception: '{}'", SystemError.what()); } } - catch (const std::bad_alloc& BadAlloc) + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; + } + catch (const std::bad_alloc& BadAlloc) + { + ZEN_WARN("scheduling garbage collection ran out of memory: '{}'", BadAlloc.what()); + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; + } + catch (const std::exception& Ex) + { + if (SilenceErrors) { - ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", BadAlloc.what()); + ZEN_WARN("scheduling garbage collection failed with: '{}'", Ex.what()); } - catch (const std::exception& Ex) + else { - ZEN_ERROR("writing gc scheduler state failed with: '{}'", Ex.what()); + ZEN_ERROR("scheduling garbage collection failed with: '{}'", Ex.what()); } + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; } + return false; } ////////////////////////////////////////////////////////////////////////// diff --git a/src/zenstore/include/zenstore/gc.h b/src/zenstore/include/zenstore/gc.h index 2edb5e0a5..92f2bdf78 100644 --- a/src/zenstore/include/zenstore/gc.h +++ b/src/zenstore/include/zenstore/gc.h @@ -537,7 +537,7 @@ public: private: void SchedulerThread(); - void CollectGarbage(const GcClock::TimePoint& CacheExpireTime, + bool CollectGarbage(const GcClock::TimePoint& CacheExpireTime, const GcClock::TimePoint& ProjectStoreExpireTime, bool Delete, bool CollectSmallObjects, @@ -545,7 +545,8 @@ private: GcVersion UseGCVersion, uint32_t CompactBlockUsageThresholdPercent, bool Verbose, - bool SingleThreaded); + bool SingleThreaded, + bool SilenceErrors); void ScrubStorage(bool DoDelete, bool SkipCid, std::chrono::seconds TimeSlice); LoggerRef Log() { return m_Log; } virtual bool AreDiskWritesAllowed() const override { return !m_AreDiskWritesBlocked.load(); } |