diff options
| author | Dan Engelbrecht <[email protected]> | 2025-05-12 11:03:46 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-05-12 11:03:46 +0200 |
| commit | a417d19e6d2af229e7fd33c559f6fefee3a81042 (patch) | |
| tree | 48d23f4c4700be6cbaa820d9ac798ddea70218cd /src | |
| parent | enable per bucket config (#388) (diff) | |
| download | zen-a417d19e6d2af229e7fd33c559f6fefee3a81042.tar.xz zen-a417d19e6d2af229e7fd33c559f6fefee3a81042.zip | |
keep snapshot on log delete fail (#391)
- Improvement: Cleaned up snapshot writing for CompactCAS/FileCas/Cache/Project stores
- Improvement: Safer recovery when failing to delete log for CompactCAS/FileCas/Cache/Project stores
- Improvement: Added log file reset when writing snapshot at startup for FileCas
Diffstat (limited to 'src')
| -rw-r--r-- | src/zenserver/projectstore/projectstore.cpp | 61 | ||||
| -rw-r--r-- | src/zenstore/cache/cachedisklayer.cpp | 47 | ||||
| -rw-r--r-- | src/zenstore/compactcas.cpp | 46 | ||||
| -rw-r--r-- | src/zenstore/compactcas.h | 2 | ||||
| -rw-r--r-- | src/zenstore/filecas.cpp | 78 | ||||
| -rw-r--r-- | src/zenstore/filecas.h | 2 | ||||
| -rw-r--r-- | src/zenstore/include/zenstore/cache/cachedisklayer.h | 10 |
7 files changed, 88 insertions, 158 deletions
diff --git a/src/zenserver/projectstore/projectstore.cpp b/src/zenserver/projectstore/projectstore.cpp index 071aab137..6a55efdb7 100644 --- a/src/zenserver/projectstore/projectstore.cpp +++ b/src/zenserver/projectstore/projectstore.cpp @@ -1643,31 +1643,9 @@ ProjectStore::Oplog::WriteIndexSnapshot() namespace fs = std::filesystem; - fs::path IndexPath = m_BasePath / "ops.zidx"; - fs::path TempIndexPath = m_BasePath / "ops.zidx.tmp"; - - // Move index away, we keep it if something goes wrong - if (IsFile(TempIndexPath)) - { - std::error_code Ec; - if (!RemoveFile(TempIndexPath, Ec) || Ec) - { - ZEN_WARN("oplog '{}/{}': snapshot failed to clean up temp snapshot at {}, reason: '{}'", - GetOuterProject()->Identifier, - m_OplogId, - TempIndexPath, - Ec.message()); - return; - } - } - + const fs::path IndexPath = m_BasePath / "ops.zidx"; try { - if (IsFile(IndexPath)) - { - RenameFile(IndexPath, TempIndexPath); - } - // Write the current state of the location map to a new index state std::vector<uint32_t> LSNEntries; std::vector<Oid> Keys; @@ -1781,7 +1759,11 @@ ProjectStore::Oplog::WriteIndexSnapshot() ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec); if (Ec) { - throw std::system_error(Ec, fmt::format("Failed to move temp file '{}' to '{}'", ObjectIndexFile.GetPath(), IndexPath)); + throw std::system_error(Ec, + fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", + ObjectIndexFile.GetPath(), + IndexPath, + Ec.message())); } EntryCount = LSNEntries.size(); m_LogFlushPosition = IndexLogPosition; @@ -1789,35 +1771,6 @@ ProjectStore::Oplog::WriteIndexSnapshot() catch (const std::exception& Err) { ZEN_WARN("oplog '{}/{}': snapshot FAILED, reason: '{}'", m_OuterProject->Identifier, m_OplogId, Err.what()); - - // Restore any previous snapshot - - if (IsFile(TempIndexPath)) - { - std::error_code Ec; - RemoveFile(IndexPath, Ec); // We don't care if this fails, we try to move the old temp file regardless - RenameFile(TempIndexPath, IndexPath, Ec); - if (Ec) - { - ZEN_WARN("oplog '{}/{}': snapshot failed to restore old snapshot from {}, reason: '{}'", - m_OuterProject->Identifier, - m_OplogId, - TempIndexPath, - Ec.message()); - } - } - } - if (IsFile(TempIndexPath)) - { - std::error_code Ec; - if (!RemoveFile(TempIndexPath, Ec) || Ec) - { - ZEN_WARN("oplog '{}/{}': snapshot failed to remove temporary file {}, reason: '{}'", - m_OuterProject->Identifier, - m_OplogId, - TempIndexPath, - Ec.message()); - } } } @@ -1827,7 +1780,7 @@ ProjectStore::Oplog::ReadIndexSnapshot() ZEN_MEMSCOPE(GetProjectstoreTag()); ZEN_TRACE_CPU("Oplog::ReadIndexSnapshot"); - std::filesystem::path IndexPath = m_BasePath / "ops.zidx"; + const std::filesystem::path IndexPath = m_BasePath / "ops.zidx"; if (IsFile(IndexPath)) { uint64_t EntryCount = 0; diff --git a/src/zenstore/cache/cachedisklayer.cpp b/src/zenstore/cache/cachedisklayer.cpp index e7b2e6bc6..91bd9cba8 100644 --- a/src/zenstore/cache/cachedisklayer.cpp +++ b/src/zenstore/cache/cachedisklayer.cpp @@ -824,12 +824,11 @@ ZenCacheDiskLayer::CacheBucket::OpenOrCreate(std::filesystem::path BucketDir, bo } void -ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, const std::function<uint64_t()>& ClaimDiskReserveFunc) +ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool ResetLog, const std::function<uint64_t()>& ClaimDiskReserveFunc) { ZEN_TRACE_CPU("Z$::Bucket::WriteIndexSnapshot"); - const uint64_t LogCount = FlushLockPosition ? 0 : m_SlogFile.GetLogCount(); - if (m_LogFlushPosition == LogCount) + if (m_LogFlushPosition == m_SlogFile.GetLogCount()) { return; } @@ -846,7 +845,7 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, namespace fs = std::filesystem; - fs::path IndexPath = cache::impl::GetIndexPath(m_BucketDir, m_BucketName); + const fs::path IndexPath = cache::impl::GetIndexPath(m_BucketDir, m_BucketName); try { @@ -878,8 +877,10 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, throw std::system_error(Ec, fmt::format("failed to create new snapshot file in '{}'", m_BucketDir)); } + const uint64_t IndexLogPosition = ResetLog ? 0 : m_SlogFile.GetLogCount(); + cache::impl::CacheBucketIndexHeader Header = {.EntryCount = EntryCount, - .LogPosition = LogCount, + .LogPosition = IndexLogPosition, .PayloadAlignment = gsl::narrow<uint32_t>(m_Configuration.PayloadAlignment)}; Header.Checksum = cache::impl::CacheBucketIndexHeader::ComputeChecksum(Header); @@ -916,34 +917,28 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec); if (Ec) { - std::filesystem::path TempFilePath = ObjectIndexFile.GetPath(); - ZEN_WARN("snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", TempFilePath, IndexPath, Ec.message()); + throw std::system_error(Ec, + fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", + ObjectIndexFile.GetPath(), + IndexPath, + Ec.message())); } - else + + if (ResetLog) { - // We must only update the log flush position once the snapshot write succeeds - if (FlushLockPosition) - { - std::filesystem::path LogPath = cache::impl::GetLogPath(m_BucketDir, m_BucketName); + const std::filesystem::path LogPath = cache::impl::GetLogPath(m_BucketDir, m_BucketName); - if (IsFile(LogPath)) + if (IsFile(LogPath)) + { + if (!RemoveFile(LogPath, Ec) || Ec) { - if (!RemoveFile(LogPath, Ec) || Ec) - { - ZEN_WARN("snapshot failed to clean log file '{}', removing index at '{}', reason: '{}'", - LogPath, - IndexPath, - Ec.message()); - std::error_code RemoveIndexEc; - RemoveFile(IndexPath, RemoveIndexEc); - } + // This is non-critical, it only means that we will replay the events of the log over the snapshot - inefficent but in + // the end it will be the same result + ZEN_WARN("snapshot failed to clean log file '{}', reason: '{}'", LogPath, IndexPath, Ec.message()); } } - if (!Ec) - { - m_LogFlushPosition = LogCount; - } } + m_LogFlushPosition = IndexLogPosition; } catch (const std::exception& Err) { diff --git a/src/zenstore/compactcas.cpp b/src/zenstore/compactcas.cpp index a8914ed20..8cf241e34 100644 --- a/src/zenstore/compactcas.cpp +++ b/src/zenstore/compactcas.cpp @@ -461,7 +461,7 @@ CasContainerStrategy::Flush() ZEN_TRACE_CPU("CasContainer::Flush"); m_BlockStore.Flush(/*ForceNewBlock*/ false); m_CasLog.Flush(); - MakeIndexSnapshot(/*FlushLockPosition*/ false); + MakeIndexSnapshot(/*ResetLog*/ false); } void @@ -924,13 +924,12 @@ CasContainerStrategy::StorageSize() const } void -CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition) +CasContainerStrategy::MakeIndexSnapshot(bool ResetLog) { ZEN_MEMSCOPE(GetCasContainerTag()); ZEN_TRACE_CPU("CasContainer::MakeIndexSnapshot"); - const uint64_t LogCount = FlushLockPosition ? 0 : m_CasLog.GetLogCount(); - if (m_LogFlushPosition == LogCount) + if (m_LogFlushPosition == m_CasLog.GetLogCount()) { return; } @@ -947,7 +946,7 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition) namespace fs = std::filesystem; - fs::path IndexPath = cas::impl::GetIndexPath(m_RootDirectory, m_ContainerBaseName); + const fs::path IndexPath = cas::impl::GetIndexPath(m_RootDirectory, m_ContainerBaseName); try { @@ -957,7 +956,10 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition) { RwLock::SharedLockScope ___(m_LocationMapLock); - IndexLogPosition = m_CasLog.GetLogCount(); + if (!ResetLog) + { + IndexLogPosition = m_CasLog.GetLogCount(); + } Entries.resize(m_LocationMap.size()); uint64_t EntryIndex = 0; @@ -967,6 +969,7 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition) IndexEntry.Key = Entry.first; IndexEntry.Location = m_Locations[Entry.second]; } + EntryCount = m_LocationMap.size(); } TemporaryFile ObjectIndexFile; @@ -976,7 +979,6 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition) { throw std::system_error(Ec, fmt::format("Failed to create temp file for index snapshot at '{}'", IndexPath)); } - EntryCount = Entries.size(); CasDiskIndexHeader Header = {.EntryCount = EntryCount, .LogPosition = IndexLogPosition, .PayloadAlignment = gsl::narrow<uint32_t>(m_PayloadAlignment)}; @@ -989,32 +991,28 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition) ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec); if (Ec) { - ZEN_WARN("snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", - ObjectIndexFile.GetPath(), - IndexPath, - Ec.message()); + throw std::system_error(Ec, + fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", + ObjectIndexFile.GetPath(), + IndexPath, + Ec.message())); } - if (FlushLockPosition) + + if (ResetLog) { - std::filesystem::path LogPath = cas::impl::GetLogPath(m_RootDirectory, m_ContainerBaseName); + const std::filesystem::path LogPath = cas::impl::GetLogPath(m_RootDirectory, m_ContainerBaseName); if (IsFile(LogPath)) { if (!RemoveFile(LogPath, Ec) || Ec) { - ZEN_WARN("snapshot failed to clean log file '{}', removing index at '{}', reason: '{}'", - LogPath, - IndexPath, - Ec.message()); - std::error_code RemoveIndexEc; - RemoveFile(IndexPath, RemoveIndexEc); + // This is non-critical, it only means that we will replay the events of the log over the snapshot - inefficent but in + // the end it will be the same result + ZEN_WARN("Snapshot failed to clean log file '{}', reason: '{}'", LogPath, IndexPath, Ec.message()); } } } - if (!Ec) - { - m_LogFlushPosition = LogCount; - } + m_LogFlushPosition = IndexLogPosition; } catch (const std::exception& Err) { @@ -1214,7 +1212,7 @@ CasContainerStrategy::OpenContainer(bool IsNewStore) if (IsNewStore || (LogEntryCount > 0)) { - MakeIndexSnapshot(/*FlushLockPosition*/ true); + MakeIndexSnapshot(/*ResetLog*/ true); } // TODO: should validate integrity of container files here diff --git a/src/zenstore/compactcas.h b/src/zenstore/compactcas.h index b7ea7591e..15e4cbf81 100644 --- a/src/zenstore/compactcas.h +++ b/src/zenstore/compactcas.h @@ -77,7 +77,7 @@ struct CasContainerStrategy final : public GcStorage, public GcReferenceStore private: CasStore::InsertResult InsertChunk(const void* ChunkData, size_t ChunkSize, const IoHash& ChunkHash); - void MakeIndexSnapshot(bool FlushLockPosition); + void MakeIndexSnapshot(bool ResetLog); uint64_t ReadIndexFile(const std::filesystem::path& IndexPath, uint32_t& OutVersion); uint64_t ReadLog(const std::filesystem::path& LogPath, uint64_t SkipEntryCount); void OpenContainer(bool IsNewStore); diff --git a/src/zenstore/filecas.cpp b/src/zenstore/filecas.cpp index 14bdc41f0..4911bffb9 100644 --- a/src/zenstore/filecas.cpp +++ b/src/zenstore/filecas.cpp @@ -251,7 +251,7 @@ FileCasStrategy::Initialize(const std::filesystem::path& RootDirectory, bool IsN if (IsNewStore || LogEntryCount > 0) { - MakeIndexSnapshot(); + MakeIndexSnapshot(/*ResetLog*/ true); } } @@ -727,7 +727,7 @@ FileCasStrategy::Flush() ZEN_TRACE_CPU("FileCas::Flush"); m_CasLog.Flush(); - MakeIndexSnapshot(); + MakeIndexSnapshot(/*ResetLog*/ false); } void @@ -912,15 +912,14 @@ FileCasStrategy::ValidateEntry(const FileCasIndexEntry& Entry, std::string& OutR } void -FileCasStrategy::MakeIndexSnapshot() +FileCasStrategy::MakeIndexSnapshot(bool ResetLog) { ZEN_MEMSCOPE(GetFileCasTag()); ZEN_TRACE_CPU("FileCas::MakeIndexSnapshot"); using namespace filecas::impl; - uint64_t LogCount = m_CasLog.GetLogCount(); - if (m_LogFlushPosition == LogCount) + if (m_LogFlushPosition == m_CasLog.GetLogCount()) { return; } @@ -937,34 +936,20 @@ FileCasStrategy::MakeIndexSnapshot() namespace fs = std::filesystem; - fs::path IndexPath = GetIndexPath(m_RootDirectory); - fs::path STmpIndexPath = GetTempIndexPath(m_RootDirectory); - - // Move index away, we keep it if something goes wrong - if (IsFile(STmpIndexPath)) - { - std::error_code Ec; - if (!RemoveFile(STmpIndexPath, Ec) || Ec) - { - ZEN_WARN("snapshot failed to clean up temp snapshot at {}, reason: '{}'", STmpIndexPath, Ec.message()); - return; - } - } + const fs::path IndexPath = GetIndexPath(m_RootDirectory); try { - if (IsFile(IndexPath)) - { - RenameFile(IndexPath, STmpIndexPath); - } - // Write the current state of the location map to a new index state std::vector<FileCasIndexEntry> Entries; uint64_t IndexLogPosition = 0; { RwLock::SharedLockScope __(m_Lock); - IndexLogPosition = m_CasLog.GetLogCount(); + if (!ResetLog) + { + IndexLogPosition = m_CasLog.GetLogCount(); + } Entries.resize(m_Index.size()); uint64_t EntryIndex = 0; @@ -974,6 +959,7 @@ FileCasStrategy::MakeIndexSnapshot() IndexEntry.Key = Entry.first; IndexEntry.Size = Entry.second.Size; } + EntryCount = m_Index.size(); } TemporaryFile ObjectIndexFile; @@ -983,47 +969,45 @@ FileCasStrategy::MakeIndexSnapshot() { throw std::system_error(Ec, fmt::format("Failed to create temp file for index snapshot at '{}'", IndexPath)); } - filecas::impl::FileCasIndexHeader Header = {.EntryCount = Entries.size(), .LogPosition = IndexLogPosition}; + filecas::impl::FileCasIndexHeader Header = {.EntryCount = EntryCount, .LogPosition = IndexLogPosition}; Header.Checksum = filecas::impl::FileCasIndexHeader::ComputeChecksum(Header); ObjectIndexFile.Write(&Header, sizeof(filecas::impl::FileCasIndexHeader), 0); - ObjectIndexFile.Write(Entries.data(), Entries.size() * sizeof(FileCasIndexEntry), sizeof(filecas::impl::FileCasIndexHeader)); + ObjectIndexFile.Write(Entries.data(), EntryCount * sizeof(FileCasIndexEntry), sizeof(filecas::impl::FileCasIndexHeader)); ObjectIndexFile.Flush(); ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec); if (Ec) { - throw std::system_error(Ec, fmt::format("Failed to move temp file '{}' to '{}'", ObjectIndexFile.GetPath(), IndexPath)); + throw std::system_error(Ec, + fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", + ObjectIndexFile.GetPath(), + IndexPath, + Ec.message())); } - EntryCount = Entries.size(); - m_LogFlushPosition = IndexLogPosition; - } - catch (const std::exception& Err) - { - ZEN_WARN("snapshot FAILED, reason: '{}'", Err.what()); - - // Restore any previous snapshot - if (IsFile(STmpIndexPath)) + if (ResetLog) { - std::error_code Ec; - RemoveFile(IndexPath, Ec); // We don't care if this fails, we try to move the old temp file regardless - RenameFile(STmpIndexPath, IndexPath, Ec); - if (Ec) + const std::filesystem::path LogPath = GetLogPath(m_RootDirectory); + + if (IsFile(LogPath)) { - ZEN_WARN("snapshot failed to restore old snapshot from {}, reason: '{}'", STmpIndexPath, Ec.message()); + if (!RemoveFile(LogPath, Ec) || Ec) + { + // This is non-critical, it only means that we will replay the events of the log over the snapshot - inefficent but in + // the end it will be the same result + ZEN_WARN("Snapshot failed to clean log file '{}', reason: '{}'", LogPath, IndexPath, Ec.message()); + } } } + m_LogFlushPosition = IndexLogPosition; } - if (IsFile(STmpIndexPath)) + catch (const std::exception& Err) { - std::error_code Ec; - if (!RemoveFile(STmpIndexPath, Ec) || Ec) - { - ZEN_WARN("snapshot failed to remove temporary file {}, reason: '{}'", STmpIndexPath, Ec.message()); - } + ZEN_WARN("snapshot FAILED, reason: '{}'", Err.what()); } } + uint64_t FileCasStrategy::ReadIndexFile(const std::filesystem::path& IndexPath, uint32_t& OutVersion) { diff --git a/src/zenstore/filecas.h b/src/zenstore/filecas.h index 21d8c3b9e..e93356927 100644 --- a/src/zenstore/filecas.h +++ b/src/zenstore/filecas.h @@ -50,7 +50,7 @@ struct FileCasStrategy final : public GcStorage, public GcReferenceStore virtual GcReferencePruner* CreateReferencePruner(GcCtx& Ctx, GcReferenceStoreStats& Stats) override; private: - void MakeIndexSnapshot(); + void MakeIndexSnapshot(bool ResetLog); uint64_t ReadIndexFile(const std::filesystem::path& IndexPath, uint32_t& OutVersion); uint64_t ReadLog(const std::filesystem::path& LogPath, uint64_t LogPosition); LoggerRef Log() { return m_Log; } diff --git a/src/zenstore/include/zenstore/cache/cachedisklayer.h b/src/zenstore/include/zenstore/cache/cachedisklayer.h index f36e780b9..54ebb324d 100644 --- a/src/zenstore/include/zenstore/cache/cachedisklayer.h +++ b/src/zenstore/include/zenstore/cache/cachedisklayer.h @@ -409,20 +409,20 @@ public: void SaveSnapshot(const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; }); void WriteIndexSnapshot( RwLock::ExclusiveLockScope&, - bool FlushLockPosition, + bool ResetLog, const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; }) { - WriteIndexSnapshotLocked(FlushLockPosition, ClaimDiskReserveFunc); + WriteIndexSnapshotLocked(ResetLog, ClaimDiskReserveFunc); } void WriteIndexSnapshot( RwLock::SharedLockScope&, - bool FlushLockPosition, + bool ResetLog, const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; }) { - WriteIndexSnapshotLocked(FlushLockPosition, ClaimDiskReserveFunc); + WriteIndexSnapshotLocked(ResetLog, ClaimDiskReserveFunc); } void WriteIndexSnapshotLocked( - bool FlushLockPosition, + bool ResetLog, const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; }); void CompactState(RwLock::ExclusiveLockScope& IndexLock, |