aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2024-08-14 10:22:57 +0200
committerGitHub Enterprise <[email protected]>2024-08-14 10:22:57 +0200
commit831f0787fca39682dec0141714eaebb7772b8639 (patch)
treea72de0a27bb2d62589c3d968b3e69b43ef3ce0fb /src
parentimprove logging on main failure (#111) (diff)
downloadzen-831f0787fca39682dec0141714eaebb7772b8639.tar.xz
zen-831f0787fca39682dec0141714eaebb7772b8639.zip
hardening and reduced spam from GC on failure (#112)
* Retry writing GC state if it fails to handle transient problems * If GC operation fails demote errors to warnings on consecutive fails
Diffstat (limited to 'src')
-rw-r--r--src/zenstore/gc.cpp370
-rw-r--r--src/zenstore/include/zenstore/gc.h5
2 files changed, 232 insertions, 143 deletions
diff --git a/src/zenstore/gc.cpp b/src/zenstore/gc.cpp
index dc75a29f2..8ed74b54b 100644
--- a/src/zenstore/gc.cpp
+++ b/src/zenstore/gc.cpp
@@ -1726,6 +1726,8 @@ GcScheduler::SchedulerThread()
std::chrono::seconds WaitTime{0};
+ bool SilenceErrors = false;
+
for (;;)
{
bool Timeout = false;
@@ -2016,15 +2018,26 @@ GcScheduler::SchedulerThread()
}
}
- CollectGarbage(CacheExpireTime,
- ProjectStoreExpireTime,
- DoDelete,
- CollectSmallObjects,
- SkipCid,
- UseGCVersion,
- CompactBlockUsageThresholdPercent,
- Verbose,
- SingleThreaded);
+ bool GcSuccess = CollectGarbage(CacheExpireTime,
+ ProjectStoreExpireTime,
+ DoDelete,
+ CollectSmallObjects,
+ SkipCid,
+ UseGCVersion,
+ CompactBlockUsageThresholdPercent,
+ Verbose,
+ SingleThreaded,
+ SilenceErrors);
+ if (!GcSuccess)
+ {
+ SilenceErrors = true;
+ ZEN_INFO("gc going into error state - gc errors will be demoted to warnings until we recover");
+ }
+ else if (SilenceErrors)
+ {
+ SilenceErrors = false;
+ ZEN_INFO("gc recovered from error state - gc errors will be will be reported as errors again");
+ }
m_GcManager.SetCancelGC(false);
@@ -2103,7 +2116,7 @@ GcScheduler::ScrubStorage(bool DoDelete, bool SkipCid, std::chrono::seconds Time
ZEN_INFO("scrubbing DONE (in {})", NiceTimeSpanMs(Timer.GetElapsedTimeMs()));
}
-void
+bool
GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime,
const GcClock::TimePoint& ProjectStoreExpireTime,
bool Delete,
@@ -2112,176 +2125,251 @@ GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime,
GcVersion UseGCVersion,
uint32_t CompactBlockUsageThresholdPercent,
bool Verbose,
- bool SingleThreaded)
+ bool SingleThreaded,
+ bool SilenceErrors)
{
ZEN_TRACE_CPU("GcScheduler::CollectGarbage");
- GcContext GcCtx(CacheExpireTime, ProjectStoreExpireTime);
- GcCtx.SetDeletionMode(Delete);
- GcCtx.SetSkipCid(SkipCid);
- GcCtx.CollectSmallObjects(CollectSmallObjects);
- GcCtx.DiskReservePath(m_Config.RootDirectory / "reserve.gc");
+ try
+ {
+ GcContext GcCtx(CacheExpireTime, ProjectStoreExpireTime);
+ GcCtx.SetDeletionMode(Delete);
+ GcCtx.SetSkipCid(SkipCid);
+ GcCtx.CollectSmallObjects(CollectSmallObjects);
+ GcCtx.DiskReservePath(m_Config.RootDirectory / "reserve.gc");
- auto ReclaimDiskReserve = [&]() {
- std::error_code Ec = CreateGCReserve(m_Config.RootDirectory / "reserve.gc", m_Config.DiskReserveSize);
- if (Ec)
- {
- ZEN_WARN("unable to create GC reserve at '{}' with size {}, reason: '{}'",
- m_Config.RootDirectory / "reserve.gc",
- NiceBytes(m_Config.DiskReserveSize),
- Ec.message());
- }
- };
+ auto ReclaimDiskReserve = [&]() {
+ std::error_code Ec = CreateGCReserve(m_Config.RootDirectory / "reserve.gc", m_Config.DiskReserveSize);
+ if (Ec)
+ {
+ ZEN_WARN("unable to create GC reserve at '{}' with size {}, reason: '{}'",
+ m_Config.RootDirectory / "reserve.gc",
+ NiceBytes(m_Config.DiskReserveSize),
+ Ec.message());
+ }
+ };
- ReclaimDiskReserve();
- const auto _ = MakeGuard([&] { ReclaimDiskReserve(); });
+ ReclaimDiskReserve();
+ const auto _ = MakeGuard([&] { ReclaimDiskReserve(); });
- CheckDiskSpace();
+ CheckDiskSpace();
- if (m_AreDiskWritesBlocked.load())
- {
- // We are low on disk, check if we can release our extra storage reserve, if we can't bail from doing GC
- uint64_t ReleasedSpace = GcCtx.ClaimGCReserve();
- if (ReleasedSpace == 0)
+ if (m_AreDiskWritesBlocked.load())
{
- ZEN_WARN("Disk space is very low and we have no GC reserve, skipping GC as this requires at least some space to write to '{}'",
- m_Config.RootDirectory);
- return;
+ // We are low on disk, check if we can release our extra storage reserve, if we can't bail from doing GC
+ uint64_t ReleasedSpace = GcCtx.ClaimGCReserve();
+ if (ReleasedSpace == 0)
+ {
+ ZEN_WARN(
+ "Disk space is very low and we have no GC reserve, skipping GC as this requires at least some space to write to '{}'",
+ m_Config.RootDirectory);
+ return true; // Treat this as a success as we don't want to silence any errors that happens when we are no longer low on
+ // disk space
+ }
}
- }
- ZEN_INFO("garbage collection STARTING, small objects gc {}, {} CAS. Cache cutoff time {}, project store cutoff time {}",
- GcCtx.CollectSmallObjects() ? "ENABLED"sv : "DISABLED"sv,
- SkipCid ? "skip"sv : "include"sv,
- CacheExpireTime,
- ProjectStoreExpireTime);
- {
- Stopwatch Timer;
- const auto __ = MakeGuard([&] { ZEN_INFO("garbage collection DONE in {}", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); });
-
- GcStorageSize Diff;
- switch (UseGCVersion)
+ ZEN_INFO("garbage collection STARTING, small objects gc {}, {} CAS. Cache cutoff time {}, project store cutoff time {}",
+ GcCtx.CollectSmallObjects() ? "ENABLED"sv : "DISABLED"sv,
+ SkipCid ? "skip"sv : "include"sv,
+ CacheExpireTime,
+ ProjectStoreExpireTime);
{
- case GcVersion::kV1:
- Diff = m_GcManager.CollectGarbage(GcCtx);
- if (SkipCid)
+ Stopwatch Timer;
+ const auto __ = MakeGuard([&] { ZEN_INFO("garbage collection DONE in {}", NiceTimeSpanMs(Timer.GetElapsedTimeMs())); });
+
+ GcStorageSize Diff;
+ switch (UseGCVersion)
+ {
+ case GcVersion::kV1:
+ Diff = m_GcManager.CollectGarbage(GcCtx);
+ if (SkipCid)
+ {
+ m_LastLightweightGCV2Result.reset();
+ }
+ else
+ {
+ m_LastFullGCV2Result.reset();
+ }
+ break;
+ case GcVersion::kV2:
+ {
+ const GcSettings Settings = {.CacheExpireTime = CacheExpireTime,
+ .ProjectStoreExpireTime = ProjectStoreExpireTime,
+ .CollectSmallObjects = CollectSmallObjects,
+ .IsDeleteMode = Delete,
+ .SkipCidDelete = SkipCid,
+ .Verbose = Verbose,
+ .SingleThread = SingleThreaded,
+ .CompactBlockUsageThresholdPercent = CompactBlockUsageThresholdPercent,
+ .DiskReservePath = m_Config.RootDirectory / "reserve.gc"};
+ GcClock::TimePoint GcStartTime = GcClock::Now();
+ GcResult Result = m_GcManager.CollectGarbage(Settings);
+
+ ZEN_INFO(
+ "GCV2: Found {} expired items out of {}, deleted {}. "
+ "Found {} unreferenced Cid entries out of {}, deleted {}. "
+ "Freed {} on disk and {} of memory in {}. "
+ "CacheExpireTime: {}, ProjectStoreExpireTime: {}, CollectSmallObjects: {}, "
+ "IsDeleteMode: {}, SkipCidDelete: {}",
+ Result.ReferencerStatSum.RemoveExpiredDataStats.FoundCount,
+ Result.ReferencerStatSum.RemoveExpiredDataStats.CheckedCount,
+ Result.ReferencerStatSum.RemoveExpiredDataStats.DeletedCount,
+
+ Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.FoundCount,
+ Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.CheckedCount,
+ Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.DeletedCount,
+
+ NiceBytes(Result.CompactStoresStatSum.RemovedDisk),
+ NiceBytes(Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory),
+ NiceTimeSpanMs(Result.ElapsedMS.count()),
+ Settings.CacheExpireTime,
+ Settings.ProjectStoreExpireTime,
+ Settings.CollectSmallObjects,
+ Settings.IsDeleteMode,
+ Settings.SkipCidDelete);
+
+ AppendGCLog(GcStartTime, Settings, Result);
+
+ if (SkipCid)
+ {
+ m_LastLightweightGCV2Result = Result;
+ }
+ else
+ {
+ m_LastFullGCV2Result = Result;
+ }
+ Diff.DiskSize = Result.CompactStoresStatSum.RemovedDisk;
+ Diff.MemorySize = Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory;
+ }
+ break;
+ }
+
+ std::chrono::milliseconds ElapsedMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs());
+ if (SkipCid)
+ {
+ m_LastLightweightGcTime = GcClock::Now();
+ m_LastLightweightGcDuration = ElapsedMS;
+ m_LastLightweightGCDiff = Diff;
+ }
+ else
+ {
+ if (Delete)
{
- m_LastLightweightGCV2Result.reset();
+ GcClock::TimePoint KeepRangeStart = Min(CacheExpireTime, ProjectStoreExpireTime);
+ m_LastGcExpireTime = KeepRangeStart;
+ std::unique_lock Lock(m_GcMutex);
+ m_DiskUsageWindow.KeepRange(KeepRangeStart.time_since_epoch().count(), GcClock::Duration::max().count());
}
- else
+
+ m_LastGcTime = GcClock::Now();
+ m_LastLightweightGcTime = m_LastGcTime;
+ m_LastFullGcDuration = ElapsedMS;
+ m_LastFullGCDiff = Diff;
+ }
+
+ for (uint32_t RetryCount = 0; RetryCount < 3; RetryCount++)
+ {
+ if (RetryCount > 0)
{
- m_LastFullGCV2Result.reset();
+ ZEN_INFO("Writing GC state failed {} time(s), pausing and trying again", RetryCount);
+ Sleep(250);
}
- break;
- case GcVersion::kV2:
+ try
{
- const GcSettings Settings = {.CacheExpireTime = CacheExpireTime,
- .ProjectStoreExpireTime = ProjectStoreExpireTime,
- .CollectSmallObjects = CollectSmallObjects,
- .IsDeleteMode = Delete,
- .SkipCidDelete = SkipCid,
- .Verbose = Verbose,
- .SingleThread = SingleThreaded,
- .CompactBlockUsageThresholdPercent = CompactBlockUsageThresholdPercent,
- .DiskReservePath = m_Config.RootDirectory / "reserve.gc"};
- GcClock::TimePoint GcStartTime = GcClock::Now();
- GcResult Result = m_GcManager.CollectGarbage(Settings);
-
- ZEN_INFO(
- "GCV2: Found {} expired items out of {}, deleted {}. "
- "Found {} unreferenced Cid entries out of {}, deleted {}. "
- "Freed {} on disk and {} of memory in {}. "
- "CacheExpireTime: {}, ProjectStoreExpireTime: {}, CollectSmallObjects: {}, "
- "IsDeleteMode: {}, SkipCidDelete: {}",
- Result.ReferencerStatSum.RemoveExpiredDataStats.FoundCount,
- Result.ReferencerStatSum.RemoveExpiredDataStats.CheckedCount,
- Result.ReferencerStatSum.RemoveExpiredDataStats.DeletedCount,
-
- Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.FoundCount,
- Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.CheckedCount,
- Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.DeletedCount,
-
- NiceBytes(Result.CompactStoresStatSum.RemovedDisk),
- NiceBytes(Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory),
- NiceTimeSpanMs(Result.ElapsedMS.count()),
- Settings.CacheExpireTime,
- Settings.ProjectStoreExpireTime,
- Settings.CollectSmallObjects,
- Settings.IsDeleteMode,
- Settings.SkipCidDelete);
-
- AppendGCLog(GcStartTime, Settings, Result);
-
- if (SkipCid)
+ const fs::path Path = m_Config.RootDirectory / "gc_state";
+ ZEN_DEBUG("saving scheduler state to '{}'", Path);
+ CbObjectWriter SchedulerState;
+ SchedulerState << "LastGcTime"sv << static_cast<int64_t>(m_LastGcTime.time_since_epoch().count());
+ SchedulerState << "LastGcExpireTime"sv << static_cast<int64_t>(m_LastGcExpireTime.time_since_epoch().count());
+ SaveCompactBinaryObject(Path, SchedulerState.Save());
+ if (RetryCount > 0)
{
- m_LastLightweightGCV2Result = Result;
+ ZEN_INFO("Writing GC state succeeded after {} attempts", RetryCount + 1);
+ }
+ break;
+ }
+ catch (const std::system_error& SystemError)
+ {
+ if (IsOOM(SystemError.code()))
+ {
+ ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", SystemError.what());
+ }
+ else if (IsOOD(SystemError.code()))
+ {
+ ZEN_WARN("writing gc scheduler state ran out of disk space: '{}'", SystemError.what());
+ }
+ if (RetryCount == 0)
+ {
+ ZEN_ERROR("writing gc scheduler state failed with system error exception: '{}'", SystemError.what());
}
else
{
- m_LastFullGCV2Result = Result;
+ ZEN_WARN("writing gc scheduler state failed with system error exception: '{}'", SystemError.what());
}
- Diff.DiskSize = Result.CompactStoresStatSum.RemovedDisk;
- Diff.MemorySize = Result.ReferencerStatSum.RemoveExpiredDataStats.FreedMemory;
}
- break;
+ catch (const std::bad_alloc& BadAlloc)
+ {
+ ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", BadAlloc.what());
+ }
+ catch (const std::exception& Ex)
+ {
+ if (RetryCount == 0)
+ {
+ ZEN_ERROR("writing gc scheduler state failed with: '{}'", Ex.what());
+ }
+ else
+ {
+ ZEN_WARN("writing gc scheduler state failed with: '{}'", Ex.what());
+ }
+ }
+ }
}
-
- std::chrono::milliseconds ElapsedMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs());
- if (SkipCid)
+ return true;
+ }
+ catch (const std::system_error& SystemError)
+ {
+ if (IsOOM(SystemError.code()))
{
- m_LastLightweightGcTime = GcClock::Now();
- m_LastLightweightGcDuration = ElapsedMS;
- m_LastLightweightGCDiff = Diff;
+ ZEN_WARN("scheduling garbage collection ran out of memory: '{}'", SystemError.what());
}
- else
- {
- if (Delete)
- {
- GcClock::TimePoint KeepRangeStart = Min(CacheExpireTime, ProjectStoreExpireTime);
- m_LastGcExpireTime = KeepRangeStart;
- std::unique_lock Lock(m_GcMutex);
- m_DiskUsageWindow.KeepRange(KeepRangeStart.time_since_epoch().count(), GcClock::Duration::max().count());
- }
-
- m_LastGcTime = GcClock::Now();
- m_LastLightweightGcTime = m_LastGcTime;
- m_LastFullGcDuration = ElapsedMS;
- m_LastFullGCDiff = Diff;
- }
-
- try
+ else if (IsOOD(SystemError.code()))
{
- const fs::path Path = m_Config.RootDirectory / "gc_state";
- ZEN_DEBUG("saving scheduler state to '{}'", Path);
- CbObjectWriter SchedulerState;
- SchedulerState << "LastGcTime"sv << static_cast<int64_t>(m_LastGcTime.time_since_epoch().count());
- SchedulerState << "LastGcExpireTime"sv << static_cast<int64_t>(m_LastGcExpireTime.time_since_epoch().count());
- SaveCompactBinaryObject(Path, SchedulerState.Save());
+ ZEN_WARN("scheduling garbage collection ran out of disk space: '{}'", SystemError.what());
}
- catch (const std::system_error& SystemError)
+ else
{
- if (IsOOM(SystemError.code()))
- {
- ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", SystemError.what());
- }
- else if (IsOOD(SystemError.code()))
+ if (SilenceErrors)
{
- ZEN_WARN("writing gc scheduler state ran out of disk space: '{}'", SystemError.what());
+ ZEN_WARN("scheduling garbage collection failed with system error exception: '{}'", SystemError.what());
}
else
{
- ZEN_ERROR("writing gc scheduler state failed with system error exception: '{}'", SystemError.what());
+ ZEN_ERROR("scheduling garbage collection failed with system error exception: '{}'", SystemError.what());
}
}
- catch (const std::bad_alloc& BadAlloc)
+ m_LastGcTime = GcClock::Now();
+ m_LastLightweightGcTime = m_LastGcTime;
+ }
+ catch (const std::bad_alloc& BadAlloc)
+ {
+ ZEN_WARN("scheduling garbage collection ran out of memory: '{}'", BadAlloc.what());
+ m_LastGcTime = GcClock::Now();
+ m_LastLightweightGcTime = m_LastGcTime;
+ }
+ catch (const std::exception& Ex)
+ {
+ if (SilenceErrors)
{
- ZEN_WARN("writing gc scheduler state ran out of memory: '{}'", BadAlloc.what());
+ ZEN_WARN("scheduling garbage collection failed with: '{}'", Ex.what());
}
- catch (const std::exception& Ex)
+ else
{
- ZEN_ERROR("writing gc scheduler state failed with: '{}'", Ex.what());
+ ZEN_ERROR("scheduling garbage collection failed with: '{}'", Ex.what());
}
+ m_LastGcTime = GcClock::Now();
+ m_LastLightweightGcTime = m_LastGcTime;
}
+ return false;
}
//////////////////////////////////////////////////////////////////////////
diff --git a/src/zenstore/include/zenstore/gc.h b/src/zenstore/include/zenstore/gc.h
index 2edb5e0a5..92f2bdf78 100644
--- a/src/zenstore/include/zenstore/gc.h
+++ b/src/zenstore/include/zenstore/gc.h
@@ -537,7 +537,7 @@ public:
private:
void SchedulerThread();
- void CollectGarbage(const GcClock::TimePoint& CacheExpireTime,
+ bool CollectGarbage(const GcClock::TimePoint& CacheExpireTime,
const GcClock::TimePoint& ProjectStoreExpireTime,
bool Delete,
bool CollectSmallObjects,
@@ -545,7 +545,8 @@ private:
GcVersion UseGCVersion,
uint32_t CompactBlockUsageThresholdPercent,
bool Verbose,
- bool SingleThreaded);
+ bool SingleThreaded,
+ bool SilenceErrors);
void ScrubStorage(bool DoDelete, bool SkipCid, std::chrono::seconds TimeSlice);
LoggerRef Log() { return m_Log; }
virtual bool AreDiskWritesAllowed() const override { return !m_AreDiskWritesBlocked.load(); }