From a417d19e6d2af229e7fd33c559f6fefee3a81042 Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Mon, 12 May 2025 11:03:46 +0200 Subject: keep snapshot on log delete fail (#391) - Improvement: Cleaned up snapshot writing for CompactCAS/FileCas/Cache/Project stores - Improvement: Safer recovery when failing to delete log for CompactCAS/FileCas/Cache/Project stores - Improvement: Added log file reset when writing snapshot at startup for FileCas --- src/zenstore/cache/cachedisklayer.cpp | 47 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'src/zenstore/cache/cachedisklayer.cpp') diff --git a/src/zenstore/cache/cachedisklayer.cpp b/src/zenstore/cache/cachedisklayer.cpp index e7b2e6bc6..91bd9cba8 100644 --- a/src/zenstore/cache/cachedisklayer.cpp +++ b/src/zenstore/cache/cachedisklayer.cpp @@ -824,12 +824,11 @@ ZenCacheDiskLayer::CacheBucket::OpenOrCreate(std::filesystem::path BucketDir, bo } void -ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, const std::function& ClaimDiskReserveFunc) +ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool ResetLog, const std::function& ClaimDiskReserveFunc) { ZEN_TRACE_CPU("Z$::Bucket::WriteIndexSnapshot"); - const uint64_t LogCount = FlushLockPosition ? 0 : m_SlogFile.GetLogCount(); - if (m_LogFlushPosition == LogCount) + if (m_LogFlushPosition == m_SlogFile.GetLogCount()) { return; } @@ -846,7 +845,7 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, namespace fs = std::filesystem; - fs::path IndexPath = cache::impl::GetIndexPath(m_BucketDir, m_BucketName); + const fs::path IndexPath = cache::impl::GetIndexPath(m_BucketDir, m_BucketName); try { @@ -878,8 +877,10 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, throw std::system_error(Ec, fmt::format("failed to create new snapshot file in '{}'", m_BucketDir)); } + const uint64_t IndexLogPosition = ResetLog ? 
0 : m_SlogFile.GetLogCount(); + cache::impl::CacheBucketIndexHeader Header = {.EntryCount = EntryCount, - .LogPosition = LogCount, + .LogPosition = IndexLogPosition, .PayloadAlignment = gsl::narrow(m_Configuration.PayloadAlignment)}; Header.Checksum = cache::impl::CacheBucketIndexHeader::ComputeChecksum(Header); @@ -916,34 +917,28 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec); if (Ec) { - std::filesystem::path TempFilePath = ObjectIndexFile.GetPath(); - ZEN_WARN("snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", TempFilePath, IndexPath, Ec.message()); + throw std::system_error(Ec, + fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", + ObjectIndexFile.GetPath(), + IndexPath, + Ec.message())); } - else + + if (ResetLog) { - // We must only update the log flush position once the snapshot write succeeds - if (FlushLockPosition) - { - std::filesystem::path LogPath = cache::impl::GetLogPath(m_BucketDir, m_BucketName); + const std::filesystem::path LogPath = cache::impl::GetLogPath(m_BucketDir, m_BucketName); - if (IsFile(LogPath)) + if (IsFile(LogPath)) + { + if (!RemoveFile(LogPath, Ec) || Ec) { - if (!RemoveFile(LogPath, Ec) || Ec) - { - ZEN_WARN("snapshot failed to clean log file '{}', removing index at '{}', reason: '{}'", - LogPath, - IndexPath, - Ec.message()); - std::error_code RemoveIndexEc; - RemoveFile(IndexPath, RemoveIndexEc); - } + // This is non-critical, it only means that we will replay the events of the log over the snapshot - inefficient but in + // the end it will be the same result + ZEN_WARN("snapshot failed to clean log file '{}', reason: '{}'", LogPath, IndexPath, Ec.message()); } } - if (!Ec) - { - m_LogFlushPosition = LogCount; - } } + m_LogFlushPosition = IndexLogPosition; } catch (const std::exception& Err) { -- cgit v1.2.3