aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2026-04-22 12:37:08 +0200
committerGitHub Enterprise <[email protected]>2026-04-22 12:37:08 +0200
commit4081fc5f231bc8450952f94c505ccb987d8eb65c (patch)
tree6e37fb5845cfae5edcbd96f8be5107e0df31484b
parentZen-style trace log events (#1006) (diff)
downloadarchived-zen-4081fc5f231bc8450952f94c505ccb987d8eb65c.tar.xz
archived-zen-4081fc5f231bc8450952f94c505ccb987d8eb65c.zip
BlockStore: fix correctness issues in block storage layer (#996)
1. **Assert invariant in `RemoveActiveWriteBlock`** — `erase(std::find(...))` was UB if the invariant ever broke. Now asserts the iterator before erasing. 2. **Single atomic delta in `SetMetaData`** — was `+= new; -= old` as two atomic ops, briefly inflating `TotalSize()` for concurrent readers. Collapsed into one `fetch_add`. 3. **Consistent `IncludeBlocks` / `IncludeBlock`** — `IncludeBlocks` asserted on duplicate keys while `IncludeBlock` silently skipped. Made both tolerant; also made the `reserve` call additive so a second call can't shrink the capacity request. 4. **Replace `operator[]` reads with `find` on `m_ChunkBlocks`** — `tsl::robin_map::operator[]` default-inserts; several read-intent lookups could produce ghost null entries if invariants broke (especially on compaction rollback paths). 5. **Bound `GetChunk` against actual file size** — `m_IoBuffer.GetSize()` is the mapped capacity (block size, e.g. 256 MiB), not written bytes. Requests inside the mapped region but past the real EOF returned views over zero-filled memory. Now bounds against `FileSize()`.
-rw-r--r--src/zenstore/blockstore.cpp79
1 files changed, 47 insertions, 32 deletions
diff --git a/src/zenstore/blockstore.cpp b/src/zenstore/blockstore.cpp
index 6528fcb2f..f25a0404a 100644
--- a/src/zenstore/blockstore.cpp
+++ b/src/zenstore/blockstore.cpp
@@ -106,31 +106,30 @@ BlockStoreFile::Create(uint64_t InitialSize)
void* FileHandle = m_File.Handle();
+ // File was truncated to 0 bytes; make the size cache reflect that explicitly.
+ m_CachedFileSize.store(0, std::memory_order::relaxed);
+
// We map our m_IoBuffer beyond the file size as we will grow it over time and want
// to be able to create sub-buffers of all the written range later
m_IoBuffer = IoBuffer(IoBuffer::File, FileHandle, 0, InitialSize, false);
}
+// `m_CachedFileSize` is the single source of truth for the file size. It is
+// seeded by `Open` (from the filesystem) or `Create` (to 0 after truncate),
+// and grown monotonically by `Write` via CAS. The BlockStore owns its block
+// files exclusively, so nothing else can change the on-disk size behind our
+// back - no filesystem round-trip is needed here.
uint64_t
BlockStoreFile::FileSize() const
{
- uint64_t CachedSize = m_CachedFileSize;
- if (CachedSize == 0)
- {
- std::error_code Ec;
- uint64_t Size = m_File.FileSize(Ec);
- if (Ec)
- {
- ZEN_WARN("Failed to get file size of block {}. Reason: {}", m_Path, Ec.message());
- return 0;
- }
- uint64_t Expected = 0;
- m_CachedFileSize.compare_exchange_strong(Expected, Size);
- return Size;
- }
- return CachedSize;
+ return m_CachedFileSize.load(std::memory_order::relaxed);
}
+// Safe to call while other threads hold a Ref<BlockStoreFile> and read: the
+// actual unlink is deferred until the last open handle closes (Windows pending
+// delete / POSIX unlinked-but-open), and the Ref keeps this object alive until
+// all readers drop it. Callers may therefore release the BlockStore's insert
+// lock, hand out a Ref, and concurrently mark the file here.
void
BlockStoreFile::MarkAsDeleteOnClose()
{
@@ -141,7 +140,13 @@ BlockStoreFile::MarkAsDeleteOnClose()
IoBuffer
BlockStoreFile::GetChunk(uint64_t Offset, uint64_t Size)
{
- if (Offset + Size > m_IoBuffer.GetSize())
+ // Bound the request against the actual written file size rather than the
+ // mapped buffer size. `m_IoBuffer` is created at `InitialSize` (= block
+ // capacity) on `Create`, so a bare `m_IoBuffer.GetSize()` check would
+ // accept offsets inside the mapped region but past the real end of the
+ // file, returning a view over uninitialised / zero-filled bytes.
+ const uint64_t WrittenSize = FileSize();
+ if (Offset > WrittenSize || Size > WrittenSize - Offset)
{
return {};
}
@@ -503,13 +508,17 @@ BlockStore::SyncExistingBlocksOnDisk(const BlockIndexSet& KnownBlocks)
}
for (std::uint32_t BlockIndex : DeleteBlocks)
{
- std::filesystem::path BlockPath = GetBlockPath(m_BlocksBasePath, BlockIndex);
- if (m_ChunkBlocks[BlockIndex])
+ auto It = m_ChunkBlocks.find(BlockIndex);
+ if (It == m_ChunkBlocks.end())
{
- m_TotalSize.fetch_sub(m_ChunkBlocks[BlockIndex]->TotalSize(), std::memory_order::relaxed);
- m_ChunkBlocks[BlockIndex]->MarkAsDeleteOnClose();
+ continue;
+ }
+ if (It->second)
+ {
+ m_TotalSize.fetch_sub(It->second->TotalSize(), std::memory_order::relaxed);
+ It->second->MarkAsDeleteOnClose();
}
- m_ChunkBlocks.erase(BlockIndex);
+ m_ChunkBlocks.erase(It);
}
return MissingBlocks;
}
@@ -657,14 +666,19 @@ BlockStore::RemoveActiveWriteBlock(uint32_t BlockIndex)
eastl::fixed_vector<Ref<BlockStoreFile>, 2> FlushBlocks;
{
RwLock::ExclusiveLockScope _(m_InsertLock);
- m_ActiveWriteBlocks.erase(std::find(m_ActiveWriteBlocks.begin(), m_ActiveWriteBlocks.end(), BlockIndex));
+ auto ActiveIt = std::find(m_ActiveWriteBlocks.begin(), m_ActiveWriteBlocks.end(), BlockIndex);
+ ZEN_ASSERT(ActiveIt != m_ActiveWriteBlocks.end());
+ m_ActiveWriteBlocks.erase(ActiveIt);
for (auto It = m_BlocksToFlush.begin(); It != m_BlocksToFlush.end();)
{
const uint32_t FlushBlockIndex = *It;
if (std::find(m_ActiveWriteBlocks.begin(), m_ActiveWriteBlocks.end(), FlushBlockIndex) == m_ActiveWriteBlocks.end())
{
- FlushBlocks.push_back(m_ChunkBlocks[FlushBlockIndex]);
- ZEN_DEBUG("Flushing block {} at '{}'", FlushBlockIndex, GetBlockPath(m_BlocksBasePath, FlushBlockIndex));
+ if (auto BlockIt = m_ChunkBlocks.find(FlushBlockIndex); BlockIt != m_ChunkBlocks.end() && BlockIt->second)
+ {
+ FlushBlocks.push_back(BlockIt->second);
+ ZEN_DEBUG("Flushing block {} at '{}'", FlushBlockIndex, GetBlockPath(m_BlocksBasePath, FlushBlockIndex));
+ }
It = m_BlocksToFlush.erase(It);
}
else
@@ -1195,9 +1209,9 @@ BlockStore::CompactBlocks(const BlockStoreCompactState& CompactState,
{
{
RwLock::ExclusiveLockScope _l(m_InsertLock);
- if (m_ChunkBlocks[NewBlockIndex] == NewBlockFile)
+ if (auto It = m_ChunkBlocks.find(NewBlockIndex); It != m_ChunkBlocks.end() && It->second == NewBlockFile)
{
- m_ChunkBlocks.erase(NewBlockIndex);
+ m_ChunkBlocks.erase(It);
}
}
if (NewBlockFile->IsOpen())
@@ -1370,8 +1384,9 @@ BlockStore::CompactBlocks(const BlockStoreCompactState& CompactState,
ZEN_ERROR("{}get disk space in '{}' FAILED, reason: '{}'", LogPrefix, m_BlocksBasePath, Error.message());
{
RwLock::ExclusiveLockScope _l(m_InsertLock);
- ZEN_ASSERT(m_ChunkBlocks[NextBlockIndex] == NewBlockFile);
- m_ChunkBlocks.erase(NextBlockIndex);
+ auto It = m_ChunkBlocks.find(NextBlockIndex);
+ ZEN_ASSERT(It != m_ChunkBlocks.end() && It->second == NewBlockFile);
+ m_ChunkBlocks.erase(It);
}
ZEN_ASSERT_SLOW(!NewBlockFile->IsOpen());
NewBlockFile = nullptr;
@@ -1390,8 +1405,9 @@ BlockStore::CompactBlocks(const BlockStoreCompactState& CompactState,
NiceBytes(Space.Free + ReclaimedSpace));
{
RwLock::ExclusiveLockScope _l(m_InsertLock);
- ZEN_ASSERT(m_ChunkBlocks[NextBlockIndex] == NewBlockFile);
- m_ChunkBlocks.erase(NextBlockIndex);
+ auto It = m_ChunkBlocks.find(NextBlockIndex);
+ ZEN_ASSERT(It != m_ChunkBlocks.end() && It->second == NewBlockFile);
+ m_ChunkBlocks.erase(It);
}
ZEN_ASSERT_SLOW(!NewBlockFile->IsOpen());
NewBlockFile = nullptr;
@@ -1537,8 +1553,7 @@ BlockStore::SetMetaData(uint32_t BlockIndex, const IoBuffer& Payload)
if (It->second->SetMetaData(Payload))
{
uint64_t NewMetaSize = It->second->MetaSize();
- m_TotalSize += NewMetaSize;
- m_TotalSize -= OldMetaSize;
+ m_TotalSize.fetch_add(NewMetaSize - OldMetaSize, std::memory_order::relaxed);
}
}
}