aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2025-05-12 11:03:46 +0200
committerGitHub Enterprise <[email protected]>2025-05-12 11:03:46 +0200
commita417d19e6d2af229e7fd33c559f6fefee3a81042 (patch)
tree48d23f4c4700be6cbaa820d9ac798ddea70218cd /src
parentenable per bucket config (#388) (diff)
downloadzen-a417d19e6d2af229e7fd33c559f6fefee3a81042.tar.xz
zen-a417d19e6d2af229e7fd33c559f6fefee3a81042.zip
keep snapshot on log delete fail (#391)
- Improvement: Cleaned up snapshot writing for CompactCAS/FileCas/Cache/Project stores - Improvement: Safer recovery when failing to delete log for CompactCAS/FileCas/Cache/Project stores - Improvement: Added log file reset when writing snapshot at startup for FileCas
Diffstat (limited to 'src')
-rw-r--r--src/zenserver/projectstore/projectstore.cpp61
-rw-r--r--src/zenstore/cache/cachedisklayer.cpp47
-rw-r--r--src/zenstore/compactcas.cpp46
-rw-r--r--src/zenstore/compactcas.h2
-rw-r--r--src/zenstore/filecas.cpp78
-rw-r--r--src/zenstore/filecas.h2
-rw-r--r--src/zenstore/include/zenstore/cache/cachedisklayer.h10
7 files changed, 88 insertions, 158 deletions
diff --git a/src/zenserver/projectstore/projectstore.cpp b/src/zenserver/projectstore/projectstore.cpp
index 071aab137..6a55efdb7 100644
--- a/src/zenserver/projectstore/projectstore.cpp
+++ b/src/zenserver/projectstore/projectstore.cpp
@@ -1643,31 +1643,9 @@ ProjectStore::Oplog::WriteIndexSnapshot()
namespace fs = std::filesystem;
- fs::path IndexPath = m_BasePath / "ops.zidx";
- fs::path TempIndexPath = m_BasePath / "ops.zidx.tmp";
-
- // Move index away, we keep it if something goes wrong
- if (IsFile(TempIndexPath))
- {
- std::error_code Ec;
- if (!RemoveFile(TempIndexPath, Ec) || Ec)
- {
- ZEN_WARN("oplog '{}/{}': snapshot failed to clean up temp snapshot at {}, reason: '{}'",
- GetOuterProject()->Identifier,
- m_OplogId,
- TempIndexPath,
- Ec.message());
- return;
- }
- }
-
+ const fs::path IndexPath = m_BasePath / "ops.zidx";
try
{
- if (IsFile(IndexPath))
- {
- RenameFile(IndexPath, TempIndexPath);
- }
-
// Write the current state of the location map to a new index state
std::vector<uint32_t> LSNEntries;
std::vector<Oid> Keys;
@@ -1781,7 +1759,11 @@ ProjectStore::Oplog::WriteIndexSnapshot()
ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec);
if (Ec)
{
- throw std::system_error(Ec, fmt::format("Failed to move temp file '{}' to '{}'", ObjectIndexFile.GetPath(), IndexPath));
+ throw std::system_error(Ec,
+ fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'",
+ ObjectIndexFile.GetPath(),
+ IndexPath,
+ Ec.message()));
}
EntryCount = LSNEntries.size();
m_LogFlushPosition = IndexLogPosition;
@@ -1789,35 +1771,6 @@ ProjectStore::Oplog::WriteIndexSnapshot()
catch (const std::exception& Err)
{
ZEN_WARN("oplog '{}/{}': snapshot FAILED, reason: '{}'", m_OuterProject->Identifier, m_OplogId, Err.what());
-
- // Restore any previous snapshot
-
- if (IsFile(TempIndexPath))
- {
- std::error_code Ec;
- RemoveFile(IndexPath, Ec); // We don't care if this fails, we try to move the old temp file regardless
- RenameFile(TempIndexPath, IndexPath, Ec);
- if (Ec)
- {
- ZEN_WARN("oplog '{}/{}': snapshot failed to restore old snapshot from {}, reason: '{}'",
- m_OuterProject->Identifier,
- m_OplogId,
- TempIndexPath,
- Ec.message());
- }
- }
- }
- if (IsFile(TempIndexPath))
- {
- std::error_code Ec;
- if (!RemoveFile(TempIndexPath, Ec) || Ec)
- {
- ZEN_WARN("oplog '{}/{}': snapshot failed to remove temporary file {}, reason: '{}'",
- m_OuterProject->Identifier,
- m_OplogId,
- TempIndexPath,
- Ec.message());
- }
}
}
@@ -1827,7 +1780,7 @@ ProjectStore::Oplog::ReadIndexSnapshot()
ZEN_MEMSCOPE(GetProjectstoreTag());
ZEN_TRACE_CPU("Oplog::ReadIndexSnapshot");
- std::filesystem::path IndexPath = m_BasePath / "ops.zidx";
+ const std::filesystem::path IndexPath = m_BasePath / "ops.zidx";
if (IsFile(IndexPath))
{
uint64_t EntryCount = 0;
diff --git a/src/zenstore/cache/cachedisklayer.cpp b/src/zenstore/cache/cachedisklayer.cpp
index e7b2e6bc6..91bd9cba8 100644
--- a/src/zenstore/cache/cachedisklayer.cpp
+++ b/src/zenstore/cache/cachedisklayer.cpp
@@ -824,12 +824,11 @@ ZenCacheDiskLayer::CacheBucket::OpenOrCreate(std::filesystem::path BucketDir, bo
}
void
-ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition, const std::function<uint64_t()>& ClaimDiskReserveFunc)
+ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool ResetLog, const std::function<uint64_t()>& ClaimDiskReserveFunc)
{
ZEN_TRACE_CPU("Z$::Bucket::WriteIndexSnapshot");
- const uint64_t LogCount = FlushLockPosition ? 0 : m_SlogFile.GetLogCount();
- if (m_LogFlushPosition == LogCount)
+ if (m_LogFlushPosition == m_SlogFile.GetLogCount())
{
return;
}
@@ -846,7 +845,7 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition,
namespace fs = std::filesystem;
- fs::path IndexPath = cache::impl::GetIndexPath(m_BucketDir, m_BucketName);
+ const fs::path IndexPath = cache::impl::GetIndexPath(m_BucketDir, m_BucketName);
try
{
@@ -878,8 +877,10 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition,
throw std::system_error(Ec, fmt::format("failed to create new snapshot file in '{}'", m_BucketDir));
}
+ const uint64_t IndexLogPosition = ResetLog ? 0 : m_SlogFile.GetLogCount();
+
cache::impl::CacheBucketIndexHeader Header = {.EntryCount = EntryCount,
- .LogPosition = LogCount,
+ .LogPosition = IndexLogPosition,
.PayloadAlignment = gsl::narrow<uint32_t>(m_Configuration.PayloadAlignment)};
Header.Checksum = cache::impl::CacheBucketIndexHeader::ComputeChecksum(Header);
@@ -916,34 +917,28 @@ ZenCacheDiskLayer::CacheBucket::WriteIndexSnapshotLocked(bool FlushLockPosition,
ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec);
if (Ec)
{
- std::filesystem::path TempFilePath = ObjectIndexFile.GetPath();
- ZEN_WARN("snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'", TempFilePath, IndexPath, Ec.message());
+ throw std::system_error(Ec,
+ fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'",
+ ObjectIndexFile.GetPath(),
+ IndexPath,
+ Ec.message()));
}
- else
+
+ if (ResetLog)
{
- // We must only update the log flush position once the snapshot write succeeds
- if (FlushLockPosition)
- {
- std::filesystem::path LogPath = cache::impl::GetLogPath(m_BucketDir, m_BucketName);
+ const std::filesystem::path LogPath = cache::impl::GetLogPath(m_BucketDir, m_BucketName);
- if (IsFile(LogPath))
+ if (IsFile(LogPath))
+ {
+ if (!RemoveFile(LogPath, Ec) || Ec)
{
- if (!RemoveFile(LogPath, Ec) || Ec)
- {
- ZEN_WARN("snapshot failed to clean log file '{}', removing index at '{}', reason: '{}'",
- LogPath,
- IndexPath,
- Ec.message());
- std::error_code RemoveIndexEc;
- RemoveFile(IndexPath, RemoveIndexEc);
- }
+ // This is non-critical, it only means that we will replay the events of the log over the snapshot - inefficent but in
+ // the end it will be the same result
+ ZEN_WARN("snapshot failed to clean log file '{}', reason: '{}'", LogPath, IndexPath, Ec.message());
}
}
- if (!Ec)
- {
- m_LogFlushPosition = LogCount;
- }
}
+ m_LogFlushPosition = IndexLogPosition;
}
catch (const std::exception& Err)
{
diff --git a/src/zenstore/compactcas.cpp b/src/zenstore/compactcas.cpp
index a8914ed20..8cf241e34 100644
--- a/src/zenstore/compactcas.cpp
+++ b/src/zenstore/compactcas.cpp
@@ -461,7 +461,7 @@ CasContainerStrategy::Flush()
ZEN_TRACE_CPU("CasContainer::Flush");
m_BlockStore.Flush(/*ForceNewBlock*/ false);
m_CasLog.Flush();
- MakeIndexSnapshot(/*FlushLockPosition*/ false);
+ MakeIndexSnapshot(/*ResetLog*/ false);
}
void
@@ -924,13 +924,12 @@ CasContainerStrategy::StorageSize() const
}
void
-CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition)
+CasContainerStrategy::MakeIndexSnapshot(bool ResetLog)
{
ZEN_MEMSCOPE(GetCasContainerTag());
ZEN_TRACE_CPU("CasContainer::MakeIndexSnapshot");
- const uint64_t LogCount = FlushLockPosition ? 0 : m_CasLog.GetLogCount();
- if (m_LogFlushPosition == LogCount)
+ if (m_LogFlushPosition == m_CasLog.GetLogCount())
{
return;
}
@@ -947,7 +946,7 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition)
namespace fs = std::filesystem;
- fs::path IndexPath = cas::impl::GetIndexPath(m_RootDirectory, m_ContainerBaseName);
+ const fs::path IndexPath = cas::impl::GetIndexPath(m_RootDirectory, m_ContainerBaseName);
try
{
@@ -957,7 +956,10 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition)
{
RwLock::SharedLockScope ___(m_LocationMapLock);
- IndexLogPosition = m_CasLog.GetLogCount();
+ if (!ResetLog)
+ {
+ IndexLogPosition = m_CasLog.GetLogCount();
+ }
Entries.resize(m_LocationMap.size());
uint64_t EntryIndex = 0;
@@ -967,6 +969,7 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition)
IndexEntry.Key = Entry.first;
IndexEntry.Location = m_Locations[Entry.second];
}
+ EntryCount = m_LocationMap.size();
}
TemporaryFile ObjectIndexFile;
@@ -976,7 +979,6 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition)
{
throw std::system_error(Ec, fmt::format("Failed to create temp file for index snapshot at '{}'", IndexPath));
}
- EntryCount = Entries.size();
CasDiskIndexHeader Header = {.EntryCount = EntryCount,
.LogPosition = IndexLogPosition,
.PayloadAlignment = gsl::narrow<uint32_t>(m_PayloadAlignment)};
@@ -989,32 +991,28 @@ CasContainerStrategy::MakeIndexSnapshot(bool FlushLockPosition)
ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec);
if (Ec)
{
- ZEN_WARN("snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'",
- ObjectIndexFile.GetPath(),
- IndexPath,
- Ec.message());
+ throw std::system_error(Ec,
+ fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'",
+ ObjectIndexFile.GetPath(),
+ IndexPath,
+ Ec.message()));
}
- if (FlushLockPosition)
+
+ if (ResetLog)
{
- std::filesystem::path LogPath = cas::impl::GetLogPath(m_RootDirectory, m_ContainerBaseName);
+ const std::filesystem::path LogPath = cas::impl::GetLogPath(m_RootDirectory, m_ContainerBaseName);
if (IsFile(LogPath))
{
if (!RemoveFile(LogPath, Ec) || Ec)
{
- ZEN_WARN("snapshot failed to clean log file '{}', removing index at '{}', reason: '{}'",
- LogPath,
- IndexPath,
- Ec.message());
- std::error_code RemoveIndexEc;
- RemoveFile(IndexPath, RemoveIndexEc);
+ // This is non-critical, it only means that we will replay the events of the log over the snapshot - inefficent but in
+ // the end it will be the same result
+ ZEN_WARN("Snapshot failed to clean log file '{}', reason: '{}'", LogPath, IndexPath, Ec.message());
}
}
}
- if (!Ec)
- {
- m_LogFlushPosition = LogCount;
- }
+ m_LogFlushPosition = IndexLogPosition;
}
catch (const std::exception& Err)
{
@@ -1214,7 +1212,7 @@ CasContainerStrategy::OpenContainer(bool IsNewStore)
if (IsNewStore || (LogEntryCount > 0))
{
- MakeIndexSnapshot(/*FlushLockPosition*/ true);
+ MakeIndexSnapshot(/*ResetLog*/ true);
}
// TODO: should validate integrity of container files here
diff --git a/src/zenstore/compactcas.h b/src/zenstore/compactcas.h
index b7ea7591e..15e4cbf81 100644
--- a/src/zenstore/compactcas.h
+++ b/src/zenstore/compactcas.h
@@ -77,7 +77,7 @@ struct CasContainerStrategy final : public GcStorage, public GcReferenceStore
private:
CasStore::InsertResult InsertChunk(const void* ChunkData, size_t ChunkSize, const IoHash& ChunkHash);
- void MakeIndexSnapshot(bool FlushLockPosition);
+ void MakeIndexSnapshot(bool ResetLog);
uint64_t ReadIndexFile(const std::filesystem::path& IndexPath, uint32_t& OutVersion);
uint64_t ReadLog(const std::filesystem::path& LogPath, uint64_t SkipEntryCount);
void OpenContainer(bool IsNewStore);
diff --git a/src/zenstore/filecas.cpp b/src/zenstore/filecas.cpp
index 14bdc41f0..4911bffb9 100644
--- a/src/zenstore/filecas.cpp
+++ b/src/zenstore/filecas.cpp
@@ -251,7 +251,7 @@ FileCasStrategy::Initialize(const std::filesystem::path& RootDirectory, bool IsN
if (IsNewStore || LogEntryCount > 0)
{
- MakeIndexSnapshot();
+ MakeIndexSnapshot(/*ResetLog*/ true);
}
}
@@ -727,7 +727,7 @@ FileCasStrategy::Flush()
ZEN_TRACE_CPU("FileCas::Flush");
m_CasLog.Flush();
- MakeIndexSnapshot();
+ MakeIndexSnapshot(/*ResetLog*/ false);
}
void
@@ -912,15 +912,14 @@ FileCasStrategy::ValidateEntry(const FileCasIndexEntry& Entry, std::string& OutR
}
void
-FileCasStrategy::MakeIndexSnapshot()
+FileCasStrategy::MakeIndexSnapshot(bool ResetLog)
{
ZEN_MEMSCOPE(GetFileCasTag());
ZEN_TRACE_CPU("FileCas::MakeIndexSnapshot");
using namespace filecas::impl;
- uint64_t LogCount = m_CasLog.GetLogCount();
- if (m_LogFlushPosition == LogCount)
+ if (m_LogFlushPosition == m_CasLog.GetLogCount())
{
return;
}
@@ -937,34 +936,20 @@ FileCasStrategy::MakeIndexSnapshot()
namespace fs = std::filesystem;
- fs::path IndexPath = GetIndexPath(m_RootDirectory);
- fs::path STmpIndexPath = GetTempIndexPath(m_RootDirectory);
-
- // Move index away, we keep it if something goes wrong
- if (IsFile(STmpIndexPath))
- {
- std::error_code Ec;
- if (!RemoveFile(STmpIndexPath, Ec) || Ec)
- {
- ZEN_WARN("snapshot failed to clean up temp snapshot at {}, reason: '{}'", STmpIndexPath, Ec.message());
- return;
- }
- }
+ const fs::path IndexPath = GetIndexPath(m_RootDirectory);
try
{
- if (IsFile(IndexPath))
- {
- RenameFile(IndexPath, STmpIndexPath);
- }
-
// Write the current state of the location map to a new index state
std::vector<FileCasIndexEntry> Entries;
uint64_t IndexLogPosition = 0;
{
RwLock::SharedLockScope __(m_Lock);
- IndexLogPosition = m_CasLog.GetLogCount();
+ if (!ResetLog)
+ {
+ IndexLogPosition = m_CasLog.GetLogCount();
+ }
Entries.resize(m_Index.size());
uint64_t EntryIndex = 0;
@@ -974,6 +959,7 @@ FileCasStrategy::MakeIndexSnapshot()
IndexEntry.Key = Entry.first;
IndexEntry.Size = Entry.second.Size;
}
+ EntryCount = m_Index.size();
}
TemporaryFile ObjectIndexFile;
@@ -983,47 +969,45 @@ FileCasStrategy::MakeIndexSnapshot()
{
throw std::system_error(Ec, fmt::format("Failed to create temp file for index snapshot at '{}'", IndexPath));
}
- filecas::impl::FileCasIndexHeader Header = {.EntryCount = Entries.size(), .LogPosition = IndexLogPosition};
+ filecas::impl::FileCasIndexHeader Header = {.EntryCount = EntryCount, .LogPosition = IndexLogPosition};
Header.Checksum = filecas::impl::FileCasIndexHeader::ComputeChecksum(Header);
ObjectIndexFile.Write(&Header, sizeof(filecas::impl::FileCasIndexHeader), 0);
- ObjectIndexFile.Write(Entries.data(), Entries.size() * sizeof(FileCasIndexEntry), sizeof(filecas::impl::FileCasIndexHeader));
+ ObjectIndexFile.Write(Entries.data(), EntryCount * sizeof(FileCasIndexEntry), sizeof(filecas::impl::FileCasIndexHeader));
ObjectIndexFile.Flush();
ObjectIndexFile.MoveTemporaryIntoPlace(IndexPath, Ec);
if (Ec)
{
- throw std::system_error(Ec, fmt::format("Failed to move temp file '{}' to '{}'", ObjectIndexFile.GetPath(), IndexPath));
+ throw std::system_error(Ec,
+ fmt::format("Snapshot failed to rename new snapshot '{}' to '{}', reason: '{}'",
+ ObjectIndexFile.GetPath(),
+ IndexPath,
+ Ec.message()));
}
- EntryCount = Entries.size();
- m_LogFlushPosition = IndexLogPosition;
- }
- catch (const std::exception& Err)
- {
- ZEN_WARN("snapshot FAILED, reason: '{}'", Err.what());
-
- // Restore any previous snapshot
- if (IsFile(STmpIndexPath))
+ if (ResetLog)
{
- std::error_code Ec;
- RemoveFile(IndexPath, Ec); // We don't care if this fails, we try to move the old temp file regardless
- RenameFile(STmpIndexPath, IndexPath, Ec);
- if (Ec)
+ const std::filesystem::path LogPath = GetLogPath(m_RootDirectory);
+
+ if (IsFile(LogPath))
{
- ZEN_WARN("snapshot failed to restore old snapshot from {}, reason: '{}'", STmpIndexPath, Ec.message());
+ if (!RemoveFile(LogPath, Ec) || Ec)
+ {
+ // This is non-critical, it only means that we will replay the events of the log over the snapshot - inefficent but in
+ // the end it will be the same result
+ ZEN_WARN("Snapshot failed to clean log file '{}', reason: '{}'", LogPath, IndexPath, Ec.message());
+ }
}
}
+ m_LogFlushPosition = IndexLogPosition;
}
- if (IsFile(STmpIndexPath))
+ catch (const std::exception& Err)
{
- std::error_code Ec;
- if (!RemoveFile(STmpIndexPath, Ec) || Ec)
- {
- ZEN_WARN("snapshot failed to remove temporary file {}, reason: '{}'", STmpIndexPath, Ec.message());
- }
+ ZEN_WARN("snapshot FAILED, reason: '{}'", Err.what());
}
}
+
uint64_t
FileCasStrategy::ReadIndexFile(const std::filesystem::path& IndexPath, uint32_t& OutVersion)
{
diff --git a/src/zenstore/filecas.h b/src/zenstore/filecas.h
index 21d8c3b9e..e93356927 100644
--- a/src/zenstore/filecas.h
+++ b/src/zenstore/filecas.h
@@ -50,7 +50,7 @@ struct FileCasStrategy final : public GcStorage, public GcReferenceStore
virtual GcReferencePruner* CreateReferencePruner(GcCtx& Ctx, GcReferenceStoreStats& Stats) override;
private:
- void MakeIndexSnapshot();
+ void MakeIndexSnapshot(bool ResetLog);
uint64_t ReadIndexFile(const std::filesystem::path& IndexPath, uint32_t& OutVersion);
uint64_t ReadLog(const std::filesystem::path& LogPath, uint64_t LogPosition);
LoggerRef Log() { return m_Log; }
diff --git a/src/zenstore/include/zenstore/cache/cachedisklayer.h b/src/zenstore/include/zenstore/cache/cachedisklayer.h
index f36e780b9..54ebb324d 100644
--- a/src/zenstore/include/zenstore/cache/cachedisklayer.h
+++ b/src/zenstore/include/zenstore/cache/cachedisklayer.h
@@ -409,20 +409,20 @@ public:
void SaveSnapshot(const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; });
void WriteIndexSnapshot(
RwLock::ExclusiveLockScope&,
- bool FlushLockPosition,
+ bool ResetLog,
const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; })
{
- WriteIndexSnapshotLocked(FlushLockPosition, ClaimDiskReserveFunc);
+ WriteIndexSnapshotLocked(ResetLog, ClaimDiskReserveFunc);
}
void WriteIndexSnapshot(
RwLock::SharedLockScope&,
- bool FlushLockPosition,
+ bool ResetLog,
const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; })
{
- WriteIndexSnapshotLocked(FlushLockPosition, ClaimDiskReserveFunc);
+ WriteIndexSnapshotLocked(ResetLog, ClaimDiskReserveFunc);
}
void WriteIndexSnapshotLocked(
- bool FlushLockPosition,
+ bool ResetLog,
const std::function<uint64_t()>& ClaimDiskReserveFunc = []() { return 0; });
void CompactState(RwLock::ExclusiveLockScope& IndexLock,