diff options
| author | Dan Engelbrecht <[email protected]> | 2024-08-14 10:22:57 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2024-08-14 10:22:57 +0200 |
| commit | 831f0787fca39682dec0141714eaebb7772b8639 (patch) | |
| tree | a72de0a27bb2d62589c3d968b3e69b43ef3ce0fb /src | |
| parent | improve logging on main failure (#111) (diff) | |
| download | zen-831f0787fca39682dec0141714eaebb7772b8639.tar.xz zen-831f0787fca39682dec0141714eaebb7772b8639.zip | |
hardening and reduced spam from GC on failure (#112)
* Retry writing GC state if it fails to handle transient problems
* If GC operation fails demote errors to warnings on consecutive fails
Diffstat (limited to 'src')
| -rw-r--r-- | src/zenstore/gc.cpp | 370 | ||||
| -rw-r--r-- | src/zenstore/include/zenstore/gc.h | 5 |
2 files changed, 232 insertions, 143 deletions
diff --git a/src/zenstore/gc.cpp b/src/zenstore/gc.cpp index dc75a29f2..8ed74b54b 100644 --- a/src/zenstore/gc.cpp +++ b/src/zenstore/gc.cpp @@ -1726,6 +1726,8 @@ GcScheduler::SchedulerThread() std::chrono::seconds WaitTime{0}; + bool SilenceErrors = false; + for (;;) { bool Timeout = false; @@ -2016,15 +2018,26 @@ GcScheduler::SchedulerThread() } } - CollectGarbage(CacheExpireTime, - ProjectStoreExpireTime, - DoDelete, - CollectSmallObjects, - SkipCid, - UseGCVersion, - CompactBlockUsageThresholdPercent, - Verbose, - SingleThreaded); + bool GcSuccess = CollectGarbage(CacheExpireTime, + ProjectStoreExpireTime, + DoDelete, + CollectSmallObjects, + SkipCid, + UseGCVersion, + CompactBlockUsageThresholdPercent, + Verbose, + SingleThreaded, + SilenceErrors); + if (!GcSuccess) + { + SilenceErrors = true; + ZEN_INFO("gc going into error state - gc errors will be demoted to warnings until we recover"); + } + else if (SilenceErrors) + { + SilenceErrors = false; + ZEN_INFO("gc recovered from error state - gc errors will be will be reported as errors again"); + } m_GcManager.SetCancelGC(false); @@ -2103,7 +2116,7 @@ GcScheduler::ScrubStorage(bool DoDelete, bool SkipCid, std::chrono::seconds Time ZEN_INFO("scrubbing DONE (in {})", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); } -void +bool GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, const GcClock::TimePoint& ProjectStoreExpireTime, bool Delete, @@ -2112,176 +2125,251 @@ GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, GcVersion UseGCVersion, uint32_t CompactBlockUsageThresholdPercent, bool Verbose, - bool SingleThreaded) + bool SingleThreaded, + bool SilenceErrors) { ZEN_TRACE_CPU("GcScheduler::CollectGarbage"); - GcContext GcCtx(CacheExpireTime, ProjectStoreExpireTime); - GcCtx.SetDeletionMode(Delete); - GcCtx.SetSkipCid(SkipCid); - GcCtx.CollectSmallObjects(CollectSmallObjects); - GcCtx.DiskReservePath(m_Config.RootDirectory / "reserve.gc"); + try + { + GcContext GcCtx(CacheExpireTime, ProjectStoreExpireTime); + GcCtx.SetDeletionMode(Delete); + GcCtx.SetSkipCid(SkipCid); + GcCtx.CollectSmallObjects(CollectSmallObjects); + GcCtx.DiskReservePath(m_Config.RootDirectory / "reserve.gc"); - auto ReclaimDiskReserve = [&]() { - std::error_code Ec = CreateGCReserve(m_Config.RootDirectory / "reserve.gc", m_Config.DiskReserveSize); - if (Ec) - { - ZEN_WARN("unable to create GC reserve at '{}' with size {}, reason: '{}'", - m_Config.RootDirectory / "reserve.gc", - NiceBytes(m_Config.DiskReserveSize), - Ec.message()); - } - }; + auto ReclaimDiskReserve = [&]() { + std::error_code Ec = CreateGCReserve(m_Config.RootDirectory / "reserve.gc", m_Config.DiskReserveSize); + if (Ec) + { + ZEN_WARN("unable to create GC reserve at '{}' with size {}, reason: '{}'", + m_Config.RootDirectory / "reserve.gc", + NiceBytes(m_Config.DiskReserveSize), + Ec.message()); + } + }; - ReclaimDiskReserve(); - const auto _ = MakeGuard([&] { ReclaimDiskReserve(); }); + ReclaimDiskReserve(); + const auto _ = MakeGuard([&] { ReclaimDiskReserve(); }); - CheckDiskSpace(); + CheckDiskSpace(); - if (m_AreDiskWritesBlocked.load()) - { - // We are low on disk, check if we can release our extra storage reserve, if we can't bail from doing GC - uint64_t ReleasedSpace = GcCtx.ClaimGCReserve(); - if (ReleasedSpace == 0) + if (m_AreDiskWritesBlocked.load()) { - ZEN_WARN("Disk space is very low and we have no GC reserve, skipping GC as this requires at least some space to write to '{}'", - m_Config.RootDirectory); - return; + // We are low on disk, check if we can release our extra storage reserve, if we can't bail from doing GC + uint64_t ReleasedSpace = GcCtx.ClaimGCReserve(); + if (ReleasedSpace == 0) + { + ZEN_WARN( + "Disk space is very low and we have no GC reserve, skipping GC as this requires at least some space to write to '{}'", + m_Config.RootDirectory); + return true; // Treat this as a success as we don't want to silence any errors that happens when we are no longer low on + // disk space + } } - } - ZEN_INFO("garbage collection STARTING, small objects gc {}, {} CAS. Cache cutoff time {}, project store cutoff time {}", - GcCtx.CollectSmallObjects() ? "ENABLED"sv : "DISABLED"sv, - SkipCid ? "skip"sv : "include"sv, - CacheExpireTime, - ProjectStoreExpireTime); - { - Stopwatch Timer; - const auto __ = MakeGuard([&] { ZEN_INFO("garbage collection DONE in {}", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); }); - - GcStorageSize Diff; - switch (UseGCVersion) + ZEN_INFO("garbage collection STARTING, small objects gc {}, {} CAS. Cache cutoff time {}, project store cutoff time {}", + GcCtx.CollectSmallObjects() ? "ENABLED"sv : "DISABLED"sv, + SkipCid ? "skip"sv : "include"sv, + CacheExpireTime, + ProjectStoreExpireTime); { - case GcVersion::kV1: - Diff = m_GcManager.CollectGarbage(GcCtx); - if (SkipCid) + Stopwatch Timer; + const auto __ = MakeGuard([&] { ZEN_INFO("garbage collection DONE in {}", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); }); + + GcStorageSize Diff; + switch (UseGCVersion) + { + case GcVersion::kV1: + Diff = m_GcManager.CollectGarbage(GcCtx); + if (SkipCid) + { + m_LastLightweightGCV2Result.reset(); + } + else + { + m_LastFullGCV2Result.reset(); + } + break; + case GcVersion::kV2: + { + const GcSettings Settings = {.CacheExpireTime = CacheExpireTime, + .ProjectStoreExpireTime = ProjectStoreExpireTime, + .CollectSmallObjects = CollectSmallObjects, + .IsDeleteMode = Delete, + .SkipCidDelete = SkipCid, + .Verbose = Verbose, + .SingleThread = SingleThreaded, + .CompactBlockUsageThresholdPercent = CompactBlockUsageThresholdPercent, + .DiskReservePath = m_Config.RootDirectory / "reserve.gc"}; + GcClock::TimePoint GcStartTime = GcClock::Now(); + GcResult Result = m_GcManager.CollectGarbage(Settings); + + ZEN_INFO( + "GCV2: Found {} expired items out of {}, deleted {}. " + "Found {} unreferenced Cid entries out of {}, deleted {}. " + "Freed {} on disk and {} of memory in {}. " + "CacheExpireTime: {}, ProjectStoreExpireTime: {}, CollectSmallObjects: {}, " + "IsDeleteMode: {}, SkipCidDelete: {}", + Result.ReferencerStatSum.RemoveExpiredDataStats.FoundCount, + Result.ReferencerStatSum.RemoveExpiredDataStats.CheckedCount, + Result.ReferencerStatSum.RemoveExpiredDataStats.DeletedCount, + + Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.FoundCount, + Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.CheckedCount, + Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.DeletedCount, + + NiceBytes(Result.CompactStoresStatSum.RemovedDisk), + NiceBytes(Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory), + NiceTimeSpanMs(Result.ElapsedMS.count()), + Settings.CacheExpireTime, + Settings.ProjectStoreExpireTime, + Settings.CollectSmallObjects, + Settings.IsDeleteMode, + Settings.SkipCidDelete); + + AppendGCLog(GcStartTime, Settings, Result); + + if (SkipCid) + { + m_LastLightweightGCV2Result = Result; + } + else + { + m_LastFullGCV2Result = Result; + } + Diff.DiskSize = Result.CompactStoresStatSum.RemovedDisk; + Diff.MemorySize = Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory; + } + break; + } + + std::chrono::milliseconds ElapsedMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); + if (SkipCid) + { + m_LastLightweightGcTime = GcClock::Now(); + m_LastLightweightGcDuration = ElapsedMS; + m_LastLightweightGCDiff = Diff; + } + else + { + if (Delete) { - m_LastLightweightGCV2Result.reset(); + GcClock::TimePoint KeepRangeStart = Min(CacheExpireTime, ProjectStoreExpireTime); + m_LastGcExpireTime = KeepRangeStart; + std::unique_lock Lock(m_GcMutex); + m_DiskUsageWindow.KeepRange(KeepRangeStart.time_since_epoch().count(), GcClock::Duration::max().count()); } - else + + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; + m_LastFullGcDuration = ElapsedMS; + m_LastFullGCDiff = Diff; + } + + for (uint32_t RetryCount = 0; RetryCount < 3; RetryCount++) + { + if (RetryCount > 0) { - m_LastFullGCV2Result.reset(); + ZEN_INFO("Writing GC state failed {} time(s), pausing and trying again", RetryCount); + Sleep(250); } - break; - case GcVersion::kV2: + try { - const GcSettings Settings = {.CacheExpireTime = CacheExpireTime, - .ProjectStoreExpireTime = ProjectStoreExpireTime, - .CollectSmallObjects = CollectSmallObjects, - .IsDeleteMode = Delete, - .SkipCidDelete = SkipCid, - .Verbose = Verbose, - .SingleThread = SingleThreaded, - .CompactBlockUsageThresholdPercent = CompactBlockUsageThresholdPercent, - .DiskReservePath = m_Config.RootDirectory / "reserve.gc"}; - GcClock::TimePoint GcStartTime = GcClock::Now(); - GcResult Result = m_GcManager.CollectGarbage(Settings); - - ZEN_INFO( - "GCV2: Found {} expired items out of {}, deleted {}. " - "Found {} unreferenced Cid entries out of {}, deleted {}. " - "Freed {} on disk and {} of memory in {}. " - "CacheExpireTime: {}, ProjectStoreExpireTime: {}, CollectSmallObjects: {}, " - "IsDeleteMode: {}, SkipCidDelete: {}", - Result.ReferencerStatSum.RemoveExpiredDataStats.FoundCount, - Result.ReferencerStatSum.RemoveExpiredDataStats.CheckedCount, - Result.ReferencerStatSum.RemoveExpiredDataStats.DeletedCount, - - Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.FoundCount, - Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.CheckedCount, - Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.DeletedCount, - - NiceBytes(Result.CompactStoresStatSum.RemovedDisk), - NiceBytes(Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory), - NiceTimeSpanMs(Result.ElapsedMS.count()), - Settings.CacheExpireTime, - Settings.ProjectStoreExpireTime, - Settings.CollectSmallObjects, - Settings.IsDeleteMode, - Settings.SkipCidDelete); - - AppendGCLog(GcStartTime, Settings, Result); - - if (SkipCid) + const fs::path Path = m_Config.RootDirectory / "gc_state"; + ZEN_DEBUG("saving scheduler state to '{}'", Path); + CbObjectWriter SchedulerState; + SchedulerState << "LastGcTime"sv << static_cast<int64_t>(m_LastGcTime.time_since_epoch().count()); + SchedulerState << "LastGcExpireTime"sv << static_cast<int64_t>(m_LastGcExpireTime.time_since_epoch().count()); + SaveCompactBinaryObject(Path, SchedulerState.Save()); + if (RetryCount > 0) { - m_LastLightweightGCV2Result = Result; + ZEN_INFO("Writing GC state succeeded after {} attempts", RetryCount + 1); + } + break; + } + catch (const std::system_error& SystemError) + { + if (IsOOM(SystemError.code())) + { + ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", SystemError.what()); + } + else if (IsOOD(SystemError.code())) + { + ZEN_WARN("writing gc scheduler state ran out of disk space: '{}'", SystemError.what()); + } + if (RetryCount == 0) + { + ZEN_ERROR("writing gc scheduler state failed with system error exception: '{}'", SystemError.what()); } else { - m_LastFullGCV2Result = Result; + ZEN_WARN("writing gc scheduler state failed with system error exception: '{}'", SystemError.what()); } - Diff.DiskSize = Result.CompactStoresStatSum.RemovedDisk; - Diff.MemorySize = Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory; } - break; + catch (const std::bad_alloc& BadAlloc) + { + ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", BadAlloc.what()); + } + catch (const std::exception& Ex) + { + if (RetryCount == 0) + { + ZEN_ERROR("writing gc scheduler state failed with: '{}'", Ex.what()); + } + else + { + ZEN_WARN("writing gc scheduler state failed with: '{}'", Ex.what()); + } + } + } } - - std::chrono::milliseconds ElapsedMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); - if (SkipCid) + return true; + } + catch (const std::system_error& SystemError) + { + if (IsOOM(SystemError.code())) { - m_LastLightweightGcTime = GcClock::Now(); - m_LastLightweightGcDuration = ElapsedMS; - m_LastLightweightGCDiff = Diff; + ZEN_WARN("scheduling garbage collection ran out of memory: '{}'", SystemError.what()); } - else - { - if (Delete) - { - GcClock::TimePoint KeepRangeStart = Min(CacheExpireTime, ProjectStoreExpireTime); - m_LastGcExpireTime = KeepRangeStart; - std::unique_lock Lock(m_GcMutex); - m_DiskUsageWindow.KeepRange(KeepRangeStart.time_since_epoch().count(), GcClock::Duration::max().count()); - } - - m_LastGcTime = GcClock::Now(); - m_LastLightweightGcTime = m_LastGcTime; - m_LastFullGcDuration = ElapsedMS; - m_LastFullGCDiff = Diff; - } - - try + else if (IsOOD(SystemError.code())) { - const fs::path Path = m_Config.RootDirectory / "gc_state"; - ZEN_DEBUG("saving scheduler state to '{}'", Path); - CbObjectWriter SchedulerState; - SchedulerState << "LastGcTime"sv << static_cast<int64_t>(m_LastGcTime.time_since_epoch().count()); - SchedulerState << "LastGcExpireTime"sv << static_cast<int64_t>(m_LastGcExpireTime.time_since_epoch().count()); - SaveCompactBinaryObject(Path, SchedulerState.Save()); + ZEN_WARN("scheduling garbage collection ran out of disk space: '{}'", SystemError.what()); } - catch (const std::system_error& SystemError) + else { - if (IsOOM(SystemError.code())) - { - ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", SystemError.what()); - } - else if (IsOOD(SystemError.code())) + if (SilenceErrors) { - ZEN_WARN("writing gc scheduler state ran out of disk space: '{}'", SystemError.what()); + ZEN_WARN("scheduling garbage collection failed with system error exception: '{}'", SystemError.what()); } else { - ZEN_ERROR("writing gc scheduler state failed with system error exception: '{}'", SystemError.what()); + ZEN_ERROR("scheduling garbage collection failed with system error exception: '{}'", SystemError.what()); } } - catch (const std::bad_alloc& BadAlloc) + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; + } + catch (const std::bad_alloc& BadAlloc) + { + ZEN_WARN("scheduling garbage collection ran out of memory: '{}'", BadAlloc.what()); + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; + } + catch (const std::exception& Ex) + { + if (SilenceErrors) { - ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", BadAlloc.what()); + ZEN_WARN("scheduling garbage collection failed with: '{}'", Ex.what()); } - catch (const std::exception& Ex) + else { - ZEN_ERROR("writing gc scheduler state failed with: '{}'", Ex.what()); + ZEN_ERROR("scheduling garbage collection failed with: '{}'", Ex.what()); } + m_LastGcTime = GcClock::Now(); + m_LastLightweightGcTime = m_LastGcTime; } + return false; } ////////////////////////////////////////////////////////////////////////// diff --git a/src/zenstore/include/zenstore/gc.h b/src/zenstore/include/zenstore/gc.h index 2edb5e0a5..92f2bdf78 100644 --- a/src/zenstore/include/zenstore/gc.h +++ b/src/zenstore/include/zenstore/gc.h @@ -537,7 +537,7 @@ public: private: void SchedulerThread(); - void CollectGarbage(const GcClock::TimePoint& CacheExpireTime, + bool CollectGarbage(const GcClock::TimePoint& CacheExpireTime, const GcClock::TimePoint& ProjectStoreExpireTime, bool Delete, bool CollectSmallObjects, @@ -545,7 +545,8 @@ private: GcVersion UseGCVersion, uint32_t CompactBlockUsageThresholdPercent, bool Verbose, - bool SingleThreaded); + bool SingleThreaded, + bool SilenceErrors); void ScrubStorage(bool DoDelete, bool SkipCid, std::chrono::seconds TimeSlice); LoggerRef Log() { return m_Log; } virtual bool AreDiskWritesAllowed() const override { return !m_AreDiskWritesBlocked.load(); } |