diff options
| author | Dan Engelbrecht <[email protected]> | 2024-11-15 10:06:39 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2024-11-15 10:06:39 +0100 |
| commit | aca6f56fde841454b13ed18136008b0ffe946aed (patch) | |
| tree | 3770efa6c789b45de8ea3ec426da7a77e7813775 /src/zenstore/gc.cpp | |
| parent | fixed some issues with ZenServerInstance::SpawnServer (#218) (diff) | |
| download | zen-aca6f56fde841454b13ed18136008b0ffe946aed.tar.xz zen-aca6f56fde841454b13ed18136008b0ffe946aed.zip | |
oplog prep gc fix (#216)
- Added option gc-validation to zenserver that checks for missing references in all oplogs after a full GC. Enabled by default.
- Feature: Added option gc-validation to zen gc command to control reference validation. Enabled by default.
- Added more details to the post-GC log.
- Fixed a race condition in oplog writes which could cause used attachments to be incorrectly removed by GC.
Diffstat (limited to 'src/zenstore/gc.cpp')
| -rw-r--r-- | src/zenstore/gc.cpp | 600 |
1 files changed, 415 insertions, 185 deletions
diff --git a/src/zenstore/gc.cpp b/src/zenstore/gc.cpp index be8fc0148..b2b574799 100644 --- a/src/zenstore/gc.cpp +++ b/src/zenstore/gc.cpp @@ -240,15 +240,15 @@ WriteCompactStoreStats(CbObjectWriter& Writer, const GcCompactStoreStats& Stats, void WriteReferencerStats(CbObjectWriter& Writer, const GcReferencerStats& Stats, bool HumanReadable) { - if (Stats.RemoveExpiredDataStats.CheckedCount == 0) + Writer << "CreateReferenceValidators" << ToTimeSpan(Stats.CreateReferenceValidatorsMS); + if (Stats.RemoveExpiredDataStats.CheckedCount != 0) { - return; - } - Writer.BeginObject("RemoveExpired"); - { - WriteGcStats(Writer, Stats.RemoveExpiredDataStats, HumanReadable); + Writer.BeginObject("RemoveExpired"); + { + WriteGcStats(Writer, Stats.RemoveExpiredDataStats, HumanReadable); + } + Writer.EndObject(); } - Writer.EndObject(); Writer.BeginObject("Compact"); { @@ -265,15 +265,14 @@ WriteReferencerStats(CbObjectWriter& Writer, const GcReferencerStats& Stats, boo void WriteReferenceStoreStats(CbObjectWriter& Writer, const GcReferenceStoreStats& Stats, bool HumanReadable) { - if (Stats.RemoveUnreferencedDataStats.CheckedCount == 0) - { - return; - } - Writer.BeginObject("RemoveUnreferenced"); + if (Stats.RemoveUnreferencedDataStats.CheckedCount != 0) { - WriteGcStats(Writer, Stats.RemoveUnreferencedDataStats, HumanReadable); + Writer.BeginObject("RemoveUnreferenced"); + { + WriteGcStats(Writer, Stats.RemoveUnreferencedDataStats, HumanReadable); + } + Writer.EndObject(); } - Writer.EndObject(); Writer.BeginObject("Compact"); { @@ -286,6 +285,21 @@ WriteReferenceStoreStats(CbObjectWriter& Writer, const GcReferenceStoreStats& St }; void +WriteReferenceValidatorStats(CbObjectWriter& Writer, const GcReferenceValidatorStats& Stats, bool /*HumanReadable*/) +{ + Writer << "Checked" << Stats.CheckedCount; + Writer << "Missing" << Stats.MissingCount; + if (Stats.MissingCount > 0) + { + Writer << "MissingChunks" << Stats.MissingChunks; + Writer << "MissingFiles" << 
Stats.MissingFiles; + Writer << "MissingMetas" << Stats.MissingMetas; + Writer << "MissingAttachments" << Stats.MissingAttachments; + } + Writer << "Elapsed" << ToTimeSpan(Stats.ElapsedMS); +}; + +void WriteGCResult(CbObjectWriter& Writer, const GcResult& Result, bool HumanReadable, bool IncludeDetails) { if (!IncludeDetails) @@ -324,8 +338,15 @@ WriteGCResult(CbObjectWriter& Writer, const GcResult& Result, bool HumanReadable } Writer.EndObject(); + Writer.BeginObject("ReferenceValidator"); + { + WriteReferenceValidatorStats(Writer, Result.ReferenceValidatorStatSum, HumanReadable); + } + Writer.EndObject(); + Writer << "RemoveExpiredData" << ToTimeSpan(Result.RemoveExpiredDataMS); Writer << "CreateReferenceCheckers" << ToTimeSpan(Result.CreateReferenceCheckersMS); + Writer << "CreateReferenceValidators" << ToTimeSpan(Result.CreateReferenceValidatorsMS); Writer << "PreCacheState" << ToTimeSpan(Result.PreCacheStateMS); Writer << "LockState" << ToTimeSpan(Result.LockStateMS); Writer << "UpdateLockedState" << ToTimeSpan(Result.UpdateLockedStateMS); @@ -333,6 +354,7 @@ WriteGCResult(CbObjectWriter& Writer, const GcResult& Result, bool HumanReadable Writer << "CreateReferencePruners" << ToTimeSpan(Result.CreateReferencePrunersMS); Writer << "RemoveUnreferencedData" << ToTimeSpan(Result.RemoveUnreferencedDataMS); Writer << "CompactStores" << ToTimeSpan(Result.CompactStoresMS); + Writer << "Validate" << ToTimeSpan(Result.ValidateReferencersMS); Writer << "WriteBlock" << ToTimeSpan(Result.WriteBlockMS); Writer << "Elapsed" << ToTimeSpan(Result.ElapsedMS); @@ -362,6 +384,18 @@ WriteGCResult(CbObjectWriter& Writer, const GcResult& Result, bool HumanReadable } Writer.EndArray(); } + if (!Result.ReferenceValidatorStats.empty()) + { + Writer.BeginArray("ReferenceValidators"); + for (const std::pair<std::string, GcReferenceValidatorStats>& It : Result.ReferenceValidatorStats) + { + Writer.BeginObject(); + Writer << "Name" << It.first; + WriteReferenceValidatorStats(Writer, 
It.second, HumanReadable); + Writer.EndObject(); + } + Writer.EndArray(); + } }; void @@ -387,7 +421,7 @@ void Sum(GcReferencerStats& Stat) { Stat.ElapsedMS = Stat.RemoveExpiredDataStats.ElapsedMS + Stat.CompactStoreStats.ElapsedMS + Stat.CreateReferenceCheckersMS + - Stat.PreCacheStateMS + Stat.UpdateLockedStateMS; + Stat.CreateReferenceValidatorsMS + Stat.PreCacheStateMS + Stat.UpdateLockedStateMS; } void @@ -397,6 +431,7 @@ Add(GcReferencerStats& Sum, const GcReferencerStats& Sub) Add(Sum.CompactStoreStats, Sub.CompactStoreStats); Sum.CreateReferenceCheckersMS += Sub.CreateReferenceCheckersMS; + Sum.CreateReferenceValidatorsMS += Sub.CreateReferenceValidatorsMS; Sum.PreCacheStateMS += Sub.PreCacheStateMS; Sum.UpdateLockedStateMS += Sub.UpdateLockedStateMS; @@ -420,6 +455,23 @@ Add(GcReferenceStoreStats& Sum, const GcReferenceStoreStats& Sub) Sum.ElapsedMS += Sub.ElapsedMS; } +void +Add(GcReferenceValidatorStats& Sum, const GcReferenceValidatorStats& Sub) +{ + Sum.CheckedCount += Sub.CheckedCount; + Sum.MissingChunks += Sub.MissingChunks; + Sum.MissingFiles += Sub.MissingFiles; + Sum.MissingMetas += Sub.MissingMetas; + Sum.MissingAttachments += Sub.MissingAttachments; + Sum.ElapsedMS += Sub.ElapsedMS; +} + +void +Sum(GcReferenceValidatorStats& Stat) +{ + Stat.MissingCount = Stat.MissingChunks + Stat.MissingFiles + Stat.MissingMetas + Stat.MissingAttachments; +} + GcResult& Sum(GcResult& Stat, bool Cancelled = false) { @@ -435,9 +487,16 @@ Sum(GcResult& Stat, bool Cancelled = false) Sum(SubStat); Add(Stat.ReferenceStoreStatSum, SubStat); } + for (std::pair<std::string, GcReferenceValidatorStats>& ReferenceValidator : Stat.ReferenceValidatorStats) + { + GcReferenceValidatorStats& SubStat = ReferenceValidator.second; + Sum(SubStat); + Add(Stat.ReferenceValidatorStatSum, SubStat); + } Sum(Stat.ReferencerStatSum); Sum(Stat.ReferenceStoreStatSum); + Sum(Stat.ReferenceValidatorStatSum); Add(Stat.CompactStoresStatSum, Stat.ReferencerStatSum.CompactStoreStats); 
Add(Stat.CompactStoresStatSum, Stat.ReferenceStoreStatSum.CompactStoreStats); @@ -630,18 +689,21 @@ GcManager::CollectGarbage(const GcSettings& Settings) Result.ReferencerStats.resize(m_GcReferencers.size()); + std::unordered_map<std::unique_ptr<GcReferenceChecker>, size_t> ReferenceCheckers; std::unordered_map<std::unique_ptr<GcStoreCompactor>, GcCompactStoreStats*> StoreCompactors; RwLock StoreCompactorsLock; + std::unordered_map<std::unique_ptr<GcReferenceValidator>, size_t> ReferenceValidators; + RwLock ReferenceValidatorsLock; WorkerThreadPool& PreCachePhaseThreadPool = Settings.SingleThread ? GetSyncWorkerPool() : GetSmallWorkerPool(EWorkloadType::Background); - ZEN_INFO("GCV2: Removing expired data from {} referencers", m_GcReferencers.size()); if (!m_GcReferencers.empty()) { if (CheckGCCancel()) { return Sum(Result, true); } + ZEN_INFO("GCV2: Removing expired data from {} referencers", m_GcReferencers.size()); ZEN_TRACE_CPU("GcV2::RemoveExpiredData"); Latch WorkLeft(1); @@ -697,9 +759,11 @@ GcManager::CollectGarbage(const GcSettings& Settings) return Sum(Result, true); } + ZEN_INFO("GCV2: Creating reference pruners from {} reference stores", m_GcReferenceStores.size()); + ZEN_TRACE_CPU("GcV2::CreateReferencePruners"); + Result.ReferenceStoreStats.resize(m_GcReferenceStores.size()); - ZEN_INFO("GCV2: Creating reference pruners from {} reference stores", m_GcReferenceStores.size()); std::unordered_map<size_t, std::unique_ptr<GcReferencePruner>> ReferencePruners; if (!m_GcReferenceStores.empty()) { @@ -771,23 +835,100 @@ GcManager::CollectGarbage(const GcSettings& Settings) } ZEN_INFO("GCV2: Creating reference checkers from {} referencers", m_GcReferencers.size()); - std::unordered_map<std::unique_ptr<GcReferenceChecker>, size_t> ReferenceCheckers; - if (!m_GcReferencers.empty()) { ZEN_TRACE_CPU("GcV2::CreateReferenceCheckers"); - ReferenceCheckers.reserve(m_GcReferencers.size()); - Latch WorkLeft(1); - RwLock ReferenceCheckersLock; + if 
(!m_GcReferencers.empty()) { - SCOPED_TIMER(Result.CreateReferenceCheckersMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); + ZEN_TRACE_CPU("GcV2::CreateReferenceCheckers"); + + ReferenceCheckers.reserve(m_GcReferencers.size()); + Latch WorkLeft(1); + RwLock ReferenceCheckersLock; + { + SCOPED_TIMER(Result.CreateReferenceCheckersMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); + if (Ctx.Settings.Verbose) { + ZEN_INFO("GCV2: Created {} reference checkers using {} referencers in {}", + ReferenceCheckers.size(), + m_GcReferencers.size(), + NiceTimeSpanMs(Result.CreateReferenceCheckersMS.count())); + }); + // Lock all reference owners from changing the reference data and get access to check for referenced data + for (size_t Index = 0; Index < m_GcReferencers.size(); Index++) + { + if (CheckGCCancel()) + { + WorkLeft.CountDown(); + WorkLeft.Wait(); + return Sum(Result, true); + } + + GcReferencer* Referencer = m_GcReferencers[Index]; + std::pair<std::string, GcReferencerStats>* Stats = &Result.ReferencerStats[Index]; + WorkLeft.AddCount(1); + PreCachePhaseThreadPool.ScheduleWork( + [this, &Ctx, &WorkLeft, Referencer, Index, Stats, &ReferenceCheckersLock, &ReferenceCheckers]() { + auto _ = MakeGuard([&WorkLeft]() { WorkLeft.CountDown(); }); + // The Referencer will create a reference checker that guarantees that the references do not change + // as long as it lives + std::vector<GcReferenceChecker*> Checkers; + try + { + { + SCOPED_TIMER(Stats->second.CreateReferenceCheckersMS = + std::chrono::milliseconds(Timer.GetElapsedTimeMs());); + Checkers = Referencer->CreateReferenceCheckers(Ctx); + } + if (!Checkers.empty()) + { + RwLock::ExclusiveLockScope __(ReferenceCheckersLock); + for (auto& Checker : Checkers) + { + ReferenceCheckers.insert_or_assign(std::unique_ptr<GcReferenceChecker>(Checker), Index); + Checker = nullptr; + } + } + } + catch (const std::exception& Ex) + { + ZEN_ERROR("GCV2: Failed creating reference checkers for {}. 
Reason: '{}'", + Referencer->GetGcName(Ctx), + Ex.what()); + SetCancelGC(true); + while (!Checkers.empty()) + { + delete Checkers.back(); + Checkers.pop_back(); + } + } + }); + } + WorkLeft.CountDown(); + WorkLeft.Wait(); + } + } + } + + if (CheckGCCancel()) + { + return Sum(Result, true); + } + + if (!m_GcReferencers.empty() && Settings.EnableValidation) + { + ZEN_INFO("GCV2: Creating reference validators from {} referencers", m_GcReferencers.size()); + ZEN_TRACE_CPU("GcV2::CreateReferenceValidators"); + + ReferenceValidators.reserve(m_GcReferencers.size()); + Latch WorkLeft(1); + { + SCOPED_TIMER(Result.CreateReferenceValidatorsMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); if (Ctx.Settings.Verbose) { - ZEN_INFO("GCV2: Created {} reference checkers using {} referencers in {}", - ReferenceCheckers.size(), + ZEN_INFO("GCV2: Created {} reference validators using {} referencers in {}", + ReferenceValidators.size(), m_GcReferencers.size(), - NiceTimeSpanMs(Result.CreateReferenceCheckersMS.count())); + NiceTimeSpanMs(Result.CreateReferenceValidatorsMS.count())); }); - // Lock all reference owners from changing the reference data and get access to check for referenced data for (size_t Index = 0; Index < m_GcReferencers.size(); Index++) { if (CheckGCCancel()) @@ -797,100 +938,106 @@ GcManager::CollectGarbage(const GcSettings& Settings) return Sum(Result, true); } - GcReferencer* Referencer = m_GcReferencers[Index]; - std::pair<std::string, GcReferencerStats>* Stats = &Result.ReferencerStats[Index]; + GcReferencer* Referencer = m_GcReferencers[Index]; + std::pair<std::string, GcReferencerStats>* ReferemcerStats = &Result.ReferencerStats[Index]; WorkLeft.AddCount(1); - PreCachePhaseThreadPool.ScheduleWork( - [this, &Ctx, &WorkLeft, Referencer, Index, Stats, &ReferenceCheckersLock, &ReferenceCheckers]() { - auto _ = MakeGuard([&WorkLeft]() { WorkLeft.CountDown(); }); - // The Referencer will create a reference checker that guarantees that the references do not 
change as - // long as it lives - std::vector<GcReferenceChecker*> Checkers; - try + PreCachePhaseThreadPool.ScheduleWork([this, + &Ctx, + &WorkLeft, + Referencer, + Index, + Result = &Result, + ReferemcerStats, + &ReferenceValidatorsLock, + &ReferenceValidators]() { + auto _ = MakeGuard([&WorkLeft]() { WorkLeft.CountDown(); }); + std::vector<GcReferenceValidator*> Validators; + try + { { - { - SCOPED_TIMER(Stats->second.CreateReferenceCheckersMS = - std::chrono::milliseconds(Timer.GetElapsedTimeMs());); - Checkers = Referencer->CreateReferenceCheckers(Ctx); - } - if (!Checkers.empty()) - { - RwLock::ExclusiveLockScope __(ReferenceCheckersLock); - for (auto& Checker : Checkers) - { - ReferenceCheckers.insert_or_assign(std::unique_ptr<GcReferenceChecker>(Checker), Index); - Checker = nullptr; - } - } + SCOPED_TIMER(ReferemcerStats->second.CreateReferenceValidatorsMS = + std::chrono::milliseconds(Timer.GetElapsedTimeMs());); + Validators = Referencer->CreateReferenceValidators(Ctx); } - catch (const std::exception& Ex) + if (!Validators.empty()) { - ZEN_ERROR("GCV2: Failed creating reference checkers for {}. Reason: '{}'", - Referencer->GetGcName(Ctx), - Ex.what()); - SetCancelGC(true); - while (!Checkers.empty()) + RwLock::ExclusiveLockScope __(ReferenceValidatorsLock); + for (auto& ReferenceValidator : Validators) { - delete Checkers.back(); - Checkers.pop_back(); + size_t ReferencesStatsIndex = Result->ReferenceValidatorStats.size(); + Result->ReferenceValidatorStats.push_back({ReferenceValidator->GetGcName(Ctx), {}}); + ReferenceValidators.insert_or_assign(std::unique_ptr<GcReferenceValidator>(ReferenceValidator), + ReferencesStatsIndex); + ReferenceValidator = nullptr; } } - }); + } + catch (const std::exception& Ex) + { + ZEN_ERROR("GCV2: Failed creating reference validators for {}. 
Reason: '{}'", + Referencer->GetGcName(Ctx), + Ex.what()); + SetCancelGC(true); + while (!Validators.empty()) + { + delete Validators.back(); + Validators.pop_back(); + } + } + }); } WorkLeft.CountDown(); WorkLeft.Wait(); } } + if (!ReferenceCheckers.empty()) { - ZEN_INFO("GCV2: Precaching state for {} reference checkers", ReferenceCheckers.size()); - if (!ReferenceCheckers.empty()) + if (CheckGCCancel()) { - if (CheckGCCancel()) - { - return Sum(Result, true); - } - ZEN_TRACE_CPU("GcV2::PreCache"); + return Sum(Result, true); + } + ZEN_INFO("GCV2: Precaching state for {} reference checkers", ReferenceCheckers.size()); + ZEN_TRACE_CPU("GcV2::PreCache"); - Latch WorkLeft(1); + Latch WorkLeft(1); + { + SCOPED_TIMER(Result.PreCacheStateMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); + if (Ctx.Settings.Verbose) { + ZEN_INFO("GCV2: Precached state using {} reference checkers in {}", + ReferenceCheckers.size(), + NiceTimeSpanMs(Result.PreCacheStateMS.count())); + }); + for (auto& It : ReferenceCheckers) { - SCOPED_TIMER(Result.PreCacheStateMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); - if (Ctx.Settings.Verbose) { - ZEN_INFO("GCV2: Precached state using {} reference checkers in {}", - ReferenceCheckers.size(), - NiceTimeSpanMs(Result.PreCacheStateMS.count())); - }); - for (auto& It : ReferenceCheckers) + if (CheckGCCancel()) { - if (CheckGCCancel()) + WorkLeft.CountDown(); + WorkLeft.Wait(); + return Sum(Result, true); + } + + GcReferenceChecker* Checker = It.first.get(); + size_t Index = It.second; + std::pair<std::string, GcReferencerStats>* Stats = &Result.ReferencerStats[Index]; + WorkLeft.AddCount(1); + PreCachePhaseThreadPool.ScheduleWork([this, &Ctx, Checker, Index, Stats, &WorkLeft]() { + auto _ = MakeGuard([&WorkLeft]() { WorkLeft.CountDown(); }); + try { - WorkLeft.CountDown(); - WorkLeft.Wait(); - return Sum(Result, true); + SCOPED_TIMER(Stats->second.PreCacheStateMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs());); + 
Checker->PreCache(Ctx); } - - GcReferenceChecker* Checker = It.first.get(); - size_t Index = It.second; - std::pair<std::string, GcReferencerStats>* Stats = &Result.ReferencerStats[Index]; - WorkLeft.AddCount(1); - PreCachePhaseThreadPool.ScheduleWork([this, &Ctx, Checker, Index, Stats, &WorkLeft]() { - auto _ = MakeGuard([&WorkLeft]() { WorkLeft.CountDown(); }); - try - { - SCOPED_TIMER(Stats->second.PreCacheStateMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs());); - Checker->PreCache(Ctx); - } - catch (const std::exception& Ex) - { - ZEN_ERROR("GCV2: Failed precaching for {}. Reason: '{}'", Checker->GetGcName(Ctx), Ex.what()); - SetCancelGC(true); - } - }); - } - WorkLeft.CountDown(); - WorkLeft.Wait(); + catch (const std::exception& Ex) + { + ZEN_ERROR("GCV2: Failed precaching for {}. Reason: '{}'", Checker->GetGcName(Ctx), Ex.what()); + SetCancelGC(true); + } + }); } + WorkLeft.CountDown(); + WorkLeft.Wait(); } } @@ -898,8 +1045,7 @@ GcManager::CollectGarbage(const GcSettings& Settings) Settings.SingleThread ? 
GetSyncWorkerPool() : GetMediumWorkerPool(EWorkloadType::Background); std::vector<RwLock::SharedLockScope> LockerScopes; - SCOPED_TIMER(uint64_t ElapsedMS = Timer.GetElapsedTimeMs(); Result.WriteBlockMS = std::chrono::milliseconds(ElapsedMS); - ZEN_INFO("GCV2: Writes blocked for {}", NiceTimeSpanMs(ElapsedMS))); + SCOPED_TIMER(uint64_t ElapsedMS = Timer.GetElapsedTimeMs(); Result.WriteBlockMS = std::chrono::milliseconds(ElapsedMS);); { if (!ReferenceCheckers.empty()) { @@ -984,100 +1130,98 @@ GcManager::CollectGarbage(const GcSettings& Settings) } } } + + if (CheckGCCancel()) { - ZEN_INFO("GCV2: Removing unreferenced data for {} reference pruners", ReferencePruners.size()); - if (CheckGCCancel()) - { - return Sum(Result, true); - } - { - const auto GetUnusedReferences = [&ReferenceCheckers, &Ctx](std::span<IoHash> References) -> std::span<IoHash> { - std::span<IoHash> UnusedCids(References); - ZEN_ASSERT(UnusedCids.empty() || UnusedCids[0] != IoHash::Zero); - for (const auto& It : ReferenceCheckers) + return Sum(Result, true); + } + ZEN_INFO("GCV2: Removing unreferenced data for {} reference pruners", ReferencePruners.size()); + { + ZEN_TRACE_CPU("GcV2::RemoveUnreferencedData"); + + const auto GetUnusedReferences = [&ReferenceCheckers, &Ctx](std::span<IoHash> References) -> std::span<IoHash> { + std::span<IoHash> UnusedCids(References); + ZEN_ASSERT(UnusedCids.empty() || UnusedCids[0] != IoHash::Zero); + for (const auto& It : ReferenceCheckers) + { + GcReferenceChecker* ReferenceChecker = It.first.get(); + UnusedCids = ReferenceChecker->GetUnusedReferences(Ctx, UnusedCids); + if (UnusedCids.empty()) { - GcReferenceChecker* ReferenceChecker = It.first.get(); - UnusedCids = ReferenceChecker->GetUnusedReferences(Ctx, UnusedCids); - if (UnusedCids.empty()) - { - return {}; - } + return {}; } - return UnusedCids; - }; - - // checking all Cids agains references in cache - // Ask stores to remove data that the ReferenceCheckers says are not referenced - this should be a 
lightweight - // operation that only updates in-memory index, actual disk changes should be done by the ReferenceStoreCompactors + } + return UnusedCids; + }; - ZEN_TRACE_CPU("GcV2::RemoveUnreferencedData"); + // checking all Cids agains references in cache + // Ask stores to remove data that the ReferenceCheckers says are not referenced - this should be a lightweight + // operation that only updates in-memory index, actual disk changes should be done by the ReferenceStoreCompactors - Latch WorkLeft(1); + Latch WorkLeft(1); + { + SCOPED_TIMER(Result.RemoveUnreferencedDataMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); + if (Ctx.Settings.Verbose) { + ZEN_INFO("GCV2: Removed unused data using {} pruners in {}", + ReferencePruners.size(), + NiceTimeSpanMs(Result.RemoveUnreferencedDataMS.count())); + }); + for (auto& It : ReferencePruners) { - SCOPED_TIMER(Result.RemoveUnreferencedDataMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); - if (Ctx.Settings.Verbose) { - ZEN_INFO("GCV2: Removed unused data using {} pruners in {}", - ReferencePruners.size(), - NiceTimeSpanMs(Result.RemoveUnreferencedDataMS.count())); - }); - for (auto& It : ReferencePruners) + if (CheckGCCancel()) { - if (CheckGCCancel()) - { - WorkLeft.CountDown(); - WorkLeft.Wait(); - return Sum(Result, true); - } + WorkLeft.CountDown(); + WorkLeft.Wait(); + return Sum(Result, true); + } - GcReferencePruner* Pruner = It.second.get(); - size_t Index = It.first; - GcReferenceStoreStats* Stats = &Result.ReferenceStoreStats[Index].second; - WorkLeft.AddCount(1); - LockedPhaseThreadPool.ScheduleWork( - [this, &Ctx, Pruner, Stats, &WorkLeft, &GetUnusedReferences, &StoreCompactorsLock, &StoreCompactors]() { - auto _ = MakeGuard([&WorkLeft]() { WorkLeft.CountDown(); }); - // Go through all the ReferenceCheckers to see if the list of Cids the collector selected are - // referenced or not. 
- try + GcReferencePruner* Pruner = It.second.get(); + size_t Index = It.first; + GcReferenceStoreStats* Stats = &Result.ReferenceStoreStats[Index].second; + WorkLeft.AddCount(1); + LockedPhaseThreadPool.ScheduleWork( + [this, &Ctx, Pruner, Stats, &WorkLeft, &GetUnusedReferences, &StoreCompactorsLock, &StoreCompactors]() { + auto _ = MakeGuard([&WorkLeft]() { WorkLeft.CountDown(); }); + // Go through all the ReferenceCheckers to see if the list of Cids the collector selected are + // referenced or not. + try + { + std::unique_ptr<GcStoreCompactor> StoreCompactor; { - std::unique_ptr<GcStoreCompactor> StoreCompactor; - { - SCOPED_TIMER(Stats->RemoveUnreferencedDataStats.ElapsedMS = - std::chrono::milliseconds(Timer.GetElapsedTimeMs());); - StoreCompactor = std::unique_ptr<GcStoreCompactor>( - Pruner->RemoveUnreferencedData(Ctx, - Stats->RemoveUnreferencedDataStats, - GetUnusedReferences)); - } - if (StoreCompactor) - { - RwLock::ExclusiveLockScope __(StoreCompactorsLock); - StoreCompactors.insert_or_assign(std::move(StoreCompactor), &Stats->CompactStoreStats); - } + SCOPED_TIMER(Stats->RemoveUnreferencedDataStats.ElapsedMS = + std::chrono::milliseconds(Timer.GetElapsedTimeMs());); + StoreCompactor = std::unique_ptr<GcStoreCompactor>( + Pruner->RemoveUnreferencedData(Ctx, + Stats->RemoveUnreferencedDataStats, + GetUnusedReferences)); } - catch (const std::exception& Ex) + if (StoreCompactor) { - ZEN_ERROR("GCV2: Failed removing unused data for {}. Reason: '{}'", - Pruner->GetGcName(Ctx), - Ex.what()); - SetCancelGC(true); + RwLock::ExclusiveLockScope __(StoreCompactorsLock); + StoreCompactors.insert_or_assign(std::move(StoreCompactor), &Stats->CompactStoreStats); } - }); - } - WorkLeft.CountDown(); - WorkLeft.Wait(); + } + catch (const std::exception& Ex) + { + ZEN_ERROR("GCV2: Failed removing unused data for {}. 
Reason: '{}'", + Pruner->GetGcName(Ctx), + Ex.what()); + SetCancelGC(true); + } + }); } + WorkLeft.CountDown(); + WorkLeft.Wait(); } - // Let the GcReferencers add new data, we will only change on-disk data at this point, adding new data is allowed - LockerScopes.clear(); - ReferenceCheckers.clear(); - ReferencePruners.clear(); } + // Let the GcReferencers add new data, we will only change on-disk data at this point, adding new data is allowed + LockerScopes.clear(); + ReferenceCheckers.clear(); + ReferencePruners.clear(); } } - ZEN_INFO("GCV2: Compacting using {} store compactors", StoreCompactors.size()); if (!StoreCompactors.empty()) { if (CheckGCCancel()) @@ -1085,6 +1229,7 @@ GcManager::CollectGarbage(const GcSettings& Settings) return Sum(Result, true); } + ZEN_INFO("GCV2: Compacting using {} store compactors", StoreCompactors.size()); ZEN_TRACE_CPU("GcV2::CompactStores"); auto ClaimDiskReserve = [&]() -> uint64_t { @@ -1129,6 +1274,47 @@ GcManager::CollectGarbage(const GcSettings& Settings) } StoreCompactors.clear(); } + + if (!ReferenceValidators.empty()) + { + if (CheckGCCancel()) + { + return Sum(Result, true); + } + + ZEN_INFO("GCV2: Validating using {} reference validators", ReferenceValidators.size()); + ZEN_TRACE_CPU("GcV2::ValidateReferences"); + + // Remove the stuff we deemed unreferenced from disk - may be heavy operation + // Don't do in parallel, we don't want to steal CPU/Disk from regular operation + { + SCOPED_TIMER(Result.ValidateReferencersMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs()); if (Ctx.Settings.Verbose) { + ZEN_INFO("GCV2: Compacted {} stores in {}", StoreCompactors.size(), NiceTimeSpanMs(Result.CompactStoresMS.count())); + }); + for (auto& It : ReferenceValidators) + { + if (CheckGCCancel()) + { + return Sum(Result, true); + } + + GcReferenceValidator* ReferenceValidator = It.first.get(); + GcReferenceValidatorStats& Stats = Result.ReferenceValidatorStats[It.second].second; + try + { + // Go through all the 
ReferenceCheckers to see if the list of Cids the collector selected are referenced or + SCOPED_TIMER(Stats.ElapsedMS = std::chrono::milliseconds(Timer.GetElapsedTimeMs());); + ReferenceValidator->Validate(Ctx, Stats); + } + catch (const std::exception& Ex) + { + ZEN_ERROR("GCV2: Failed validating referencer {}. Reason: '{}'", ReferenceValidator->GetGcName(Ctx), Ex.what()); + SetCancelGC(true); + } + } + } + ReferenceValidators.clear(); + } } return Sum(Result); @@ -1521,6 +1707,7 @@ GcScheduler::AppendGCLog(std::string_view Id, GcClock::TimePoint StartTime, cons Writer << "AttachmentRangeMax"sv << Settings.AttachmentRangeMin; Writer << "ForceStoreCacheAttachmentMetaData"sv << Settings.StoreCacheAttachmentMetaData; Writer << "ForceStoreProjectAttachmentMetaData"sv << Settings.StoreProjectAttachmentMetaData; + Writer << "EnableValidation"sv << Settings.EnableValidation; } Writer.EndObject(); @@ -1719,6 +1906,7 @@ GcScheduler::SchedulerThread() IoHash AttachmentRangeMax = IoHash::Max; bool StoreCacheAttachmentMetaData = m_Config.StoreCacheAttachmentMetaData; bool StoreProjectAttachmentMetaData = m_Config.StoreProjectAttachmentMetaData; + bool EnableValidation = m_Config.EnableValidation; uint8_t NextAttachmentPassIndex = ComputeAttachmentRange(m_AttachmentPassIndex, m_Config.AttachmentPassCount, AttachmentRangeMin, AttachmentRangeMax); @@ -1774,6 +1962,10 @@ GcScheduler::SchedulerThread() { StoreProjectAttachmentMetaData = TriggerParams.StoreProjectAttachmentMetaData.value(); } + if (TriggerParams.EnableValidation.has_value()) + { + EnableValidation = TriggerParams.EnableValidation.value(); + } DoGc = true; } @@ -2002,6 +2194,7 @@ GcScheduler::SchedulerThread() AttachmentRangeMax, StoreCacheAttachmentMetaData, StoreProjectAttachmentMetaData, + EnableValidation, SilenceErrors); if (!GcSuccess) { @@ -2105,6 +2298,7 @@ GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, const IoHash& AttachmentRangeMax, bool StoreCacheAttachmentMetaData, bool 
StoreProjectAttachmentMetaData, + bool EnableValidation, bool SilenceErrors) { ZEN_TRACE_CPU("GcScheduler::CollectGarbage"); @@ -2184,25 +2378,26 @@ GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, .AttachmentRangeMin = AttachmentRangeMin, .AttachmentRangeMax = AttachmentRangeMax, .StoreCacheAttachmentMetaData = StoreCacheAttachmentMetaData, - .StoreProjectAttachmentMetaData = StoreProjectAttachmentMetaData}; + .StoreProjectAttachmentMetaData = StoreProjectAttachmentMetaData, + .EnableValidation = EnableValidation}; auto AppendSettings = [](StringBuilderBase& SB, const GcSettings& Settings) { SB.Append( - fmt::format(" GC small objects: {}\n", Settings.CollectSmallObjects ? "yes"sv : "no"sv)); - SB.Append(fmt::format(" GC Cid store: {}\n", Settings.SkipCidDelete ? "no"sv : "yes"sv)); + fmt::format(" GC small objects: {}\n", Settings.CollectSmallObjects ? "yes"sv : "no"sv)); + SB.Append(fmt::format(" GC Cid store: {}\n", Settings.SkipCidDelete ? "no"sv : "yes"sv)); if (!Settings.SkipCidDelete) { if (Settings.AttachmentRangeMin != IoHash::Zero || Settings.AttachmentRangeMax != IoHash::Max) { - SB.Append(fmt::format(" Attachment range: {}-{}\n", + SB.Append(fmt::format(" Attachment range: {}-{}\n", Settings.AttachmentRangeMin, Settings.AttachmentRangeMax)); } - SB.Append(fmt::format(" Cache attachment meta: {}\n", Settings.StoreCacheAttachmentMetaData)); - SB.Append(fmt::format(" Project attachment meta: {}\n", Settings.StoreProjectAttachmentMetaData)); + SB.Append(fmt::format(" Cache attachment meta: {}\n", Settings.StoreCacheAttachmentMetaData)); + SB.Append(fmt::format(" Project attachment meta: {}\n", Settings.StoreProjectAttachmentMetaData)); + SB.Append(fmt::format(" Enable validation: {}\n", Settings.EnableValidation)); } - SB.Append(fmt::format(" Cache cutoff time: {}\n", Settings.CacheExpireTime)); - SB.Append(fmt::format(" Project store cutoff time: {}", Settings.ProjectStoreExpireTime)); + SB.Append(fmt::format(" Cache cutoff time: 
{}\n", Settings.CacheExpireTime)); }; { @@ -2224,7 +2419,36 @@ GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, { SB.Append(fmt::format("COMPLETED '{}' in {}\n", GcId, NiceTimeSpanMs(Result.ElapsedMS.count()))); AppendSettings(SB, Settings); - SB.Append("\n\n"); + SB.Append("\n"); + SB.Append( + fmt::format(" Remove Expired Data: {}\n", NiceTimeSpanMs(Result.RemoveExpiredDataMS.count()))); + if (!Settings.SkipCidDelete) + { + SB.Append(fmt::format(" Create Reference Pruners: {}\n", + NiceTimeSpanMs(Result.CreateReferencePrunersMS.count()))); + SB.Append(fmt::format(" Create Reference Checkers: {}\n", + NiceTimeSpanMs(Result.CreateReferenceCheckersMS.count()))); + if (EnableValidation) + { + SB.Append(fmt::format(" Create Reference Validators: {}\n", + NiceTimeSpanMs(Result.CreateReferenceValidatorsMS.count()))); + } + SB.Append( + fmt::format(" Precache State: {}\n", NiceTimeSpanMs(Result.PreCacheStateMS.count()))); + SB.Append( + fmt::format(" Writes blocked: {}\n", NiceTimeSpanMs(Result.WriteBlockMS.count()))); + SB.Append( + fmt::format(" Lock State: {}\n", NiceTimeSpanMs(Result.LockStateMS.count()))); + SB.Append(fmt::format(" Update Lock State: {}\n", + NiceTimeSpanMs(Result.UpdateLockedStateMS.count()))); + SB.Append(fmt::format(" Remove Unreferenced: {}\n", + NiceTimeSpanMs(Result.RemoveUnreferencedDataMS.count()))); + SB.Append( + fmt::format(" Compacting Stores: {}\n", NiceTimeSpanMs(Result.CompactStoresMS.count()))); + SB.Append(fmt::format(" Reference Validation: {}\n", + NiceTimeSpanMs(Result.ValidateReferencersMS.count()))); + SB.Append("\n"); + } SB.Append(fmt::format(" Found {} expired items out of {}, deleted {}\n", Result.ReferencerStatSum.RemoveExpiredDataStats.FoundCount, Result.ReferencerStatSum.RemoveExpiredDataStats.CheckedCount, @@ -2235,6 +2459,12 @@ GcScheduler::CollectGarbage(const GcClock::TimePoint& CacheExpireTime, Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.FoundCount, 
Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.CheckedCount, Result.ReferenceStoreStatSum.RemoveUnreferencedDataStats.DeletedCount)); + if (EnableValidation) + { + SB.Append(fmt::format(" Validated {} items and found {} missing references\n", + Result.ReferenceValidatorStatSum.CheckedCount, + Result.ReferenceValidatorStatSum.MissingCount)); + } } SB.Append(fmt::format(" Freed {} on disk and {} of memory\n", NiceBytes(Result.CompactStoresStatSum.RemovedDisk), |