diff options
| author | Dan Engelbrecht <[email protected]> | 2022-05-01 23:31:35 +0200 |
|---|---|---|
| committer | Dan Engelbrecht <[email protected]> | 2022-05-01 23:31:35 +0200 |
| commit | 08a0dc388f98e6d3eb8387b983a9a7fb959fe603 (patch) | |
| tree | 5f4bef8e81e55e22f65690250c84d943098c738b | |
| parent | remove m_TotalSize for blockstore (diff) | |
| download | zen-08a0dc388f98e6d3eb8387b983a9a7fb959fe603.tar.xz zen-08a0dc388f98e6d3eb8387b983a9a7fb959fe603.zip | |
reimplement CasContainerStrategy::Scrub
| -rw-r--r-- | zenstore/blockstore.cpp | 82 | ||||
| -rw-r--r-- | zenstore/compactcas.cpp | 132 | ||||
| -rw-r--r-- | zenstore/include/zenstore/blockstore.h | 4 |
3 files changed, 140 insertions, 78 deletions
diff --git a/zenstore/blockstore.cpp b/zenstore/blockstore.cpp index b4aa0f7c3..559dfc1ee 100644 --- a/zenstore/blockstore.cpp +++ b/zenstore/blockstore.cpp @@ -105,6 +105,12 @@ BlockStoreFile::Flush() m_File.Flush(); } +BasicFile& +BlockStoreFile::GetBasicFile() +{ + return m_File; +} + void BlockStoreFile::StreamByteRange(uint64_t FileOffset, uint64_t Size, std::function<void(const void* Data, uint64_t Size)>&& ChunkFun) { @@ -577,6 +583,82 @@ BlockStore::ReclaimSpace(const ReclaimSnapshotState& Snapshot, } } +void +BlockStore::IterateChunks(const std::vector<BlockStoreLocation>& ChunkLocations, + std::function<void(size_t ChunkIndex, const void* Data, uint64_t Size)> SmallChunkCallback, + std::function<void(size_t ChunkIndex, BasicFile& BlockFile, uint64_t Offset, uint64_t Size)> LargeChunkCallback) +{ + // We do a read sweep through the payloads file and validate + // any entries that are contained within each segment, with + // the assumption that most entries will be checked in this + // pass. An alternative strategy would be to use memory mapping. + + { + std::vector<size_t> BigChunks; + const uint64_t WindowSize = 4 * 1024 * 1024; + IoBuffer ReadBuffer{WindowSize}; + void* BufferBase = ReadBuffer.MutableData(); + + RwLock::SharedLockScope _(m_InsertLock); // TODO: Refactor so we don't have to keep m_InsertLock all the time? + + for (const auto& Block : m_ChunkBlocks) + { + uint64_t WindowStart = 0; + uint64_t WindowEnd = WindowSize; + uint32_t BlockIndex = Block.first; + const Ref<BlockStoreFile>& BlockFile = Block.second; + BlockFile->Open(); + const uint64_t FileSize = BlockFile->FileSize(); + + do + { + const uint64_t ChunkSize = Min(WindowSize, FileSize - WindowStart); + BlockFile->Read(BufferBase, ChunkSize, WindowStart); + + // TODO: We could be smarter here if the ChunkLocations were sorted on block index - we could + // then only scan a subset of ChunkLocations instead of scanning through them all... + for (size_t ChunkIndex = 0; ChunkIndex < ChunkLocations.size(); ++ChunkIndex) + { + const BlockStoreLocation Location = ChunkLocations[ChunkIndex]; + if (BlockIndex != Location.BlockIndex) + { + continue; + } + + const uint64_t EntryOffset = Location.Offset; + if ((EntryOffset >= WindowStart) && (EntryOffset < WindowEnd)) + { + const uint64_t EntryEnd = EntryOffset + Location.Size; + + if (EntryEnd >= WindowEnd) + { + BigChunks.push_back(ChunkIndex); + + continue; + } + + SmallChunkCallback(ChunkIndex, + reinterpret_cast<uint8_t*>(BufferBase) + Location.Offset - WindowStart, + Location.Size); + } + } + + WindowStart += WindowSize; + WindowEnd += WindowSize; + } while (WindowStart < FileSize); + } + + // Deal with large chunks + + for (size_t ChunkIndex : BigChunks) + { + const BlockStoreLocation Location = ChunkLocations[ChunkIndex]; + BasicFile& BlockFile = m_ChunkBlocks[Location.BlockIndex]->GetBasicFile(); + LargeChunkCallback(ChunkIndex, BlockFile, Location.Offset, Location.Size); + } + } +} + const char* BlockStore::GetBlockFileExtension() { diff --git a/zenstore/compactcas.cpp b/zenstore/compactcas.cpp index 30747f554..a6e617474 100644 --- a/zenstore/compactcas.cpp +++ b/zenstore/compactcas.cpp @@ -338,84 +338,56 @@ void CasContainerStrategy::Scrub(ScrubContext& Ctx) { ZEN_UNUSED(Ctx); -#if 0 - std::vector<CasDiskIndexEntry> BadChunks; - // We do a read sweep through the payloads file and validate - // any entries that are contained within each segment, with - // the assumption that most entries will be checked in this - // pass. An alternative strategy would be to use memory mapping. + RwLock::SharedLockScope _(m_LocationMapLock); + uint64_t TotalChunkCount = m_LocationMap.size(); + std::vector<BlockStoreLocation> ChunkLocations; + std::vector<IoHash> ChunkIndexToChunkHash; + ChunkLocations.reserve(TotalChunkCount); + ChunkIndexToChunkHash.reserve(TotalChunkCount); { - std::vector<CasDiskIndexEntry> BigChunks; - const uint64_t WindowSize = 4 * 1024 * 1024; - IoBuffer ReadBuffer{WindowSize}; - void* BufferBase = ReadBuffer.MutableData(); - - RwLock::SharedLockScope _(m_InsertLock); // TODO: Refactor so we don't have to keep m_InsertLock all the time? - RwLock::SharedLockScope __(m_LocationMapLock); - - for (const auto& Block : m_ChunkBlocks) + for (const auto& Entry : m_LocationMap) { - uint64_t WindowStart = 0; - uint64_t WindowEnd = WindowSize; - const Ref<BlockStoreFile>& BlockFile = Block.second; - BlockFile->Open(); - const uint64_t FileSize = BlockFile->FileSize(); + const IoHash& ChunkHash = Entry.first; + const BlockStoreDiskLocation& DiskLocation = Entry.second; + BlockStoreLocation Location = DiskLocation.Get(m_PayloadAlignment); + size_t ChunkIndex = ChunkLocations.size(); - do - { - const uint64_t ChunkSize = Min(WindowSize, FileSize - WindowStart); - BlockFile->Read(BufferBase, ChunkSize, WindowStart); - - for (auto& Entry : m_LocationMap) - { - const BlockStoreLocation Location = Entry.second.Get(m_PayloadAlignment); - const uint64_t EntryOffset = Location.Offset; - - if ((EntryOffset >= WindowStart) && (EntryOffset < WindowEnd)) - { - const uint64_t EntryEnd = EntryOffset + Location.Size; - - if (EntryEnd >= WindowEnd) - { - BigChunks.push_back({.Key = Entry.first, .Location = Entry.second}); - - continue; - } - - const IoHash ComputedHash = - IoHash::HashBuffer(reinterpret_cast<uint8_t*>(BufferBase) + Location.Offset - WindowStart, Location.Size); - - if (Entry.first != ComputedHash) - { - // Hash mismatch - BadChunks.push_back({.Key = Entry.first, .Location = Entry.second, .Flags = CasDiskIndexEntry::kTombstone}); - } - } - } - - WindowStart += WindowSize; - WindowEnd += WindowSize; - } while (WindowStart < FileSize); + ChunkLocations.push_back(Location); + ChunkIndexToChunkHash[ChunkIndex] = ChunkHash; } + } - // Deal with large chunks + std::vector<IoHash> BadChunks; - for (const CasDiskIndexEntry& Entry : BigChunks) - { - IoHashStream Hasher; - const BlockStoreLocation Location = Entry.Location.Get(m_PayloadAlignment); - const Ref<BlockStoreFile>& BlockFile = m_ChunkBlocks[Location.BlockIndex]; - BlockFile->StreamByteRange(Location.Offset, Location.Size, [&](const void* Data, uint64_t Size) { Hasher.Append(Data, Size); }); - IoHash ComputedHash = Hasher.GetHash(); + // We do a read sweep through the payloads file and validate + // any entries that are contained within each segment, with + // the assumption that most entries will be checked in this + // pass. An alternative strategy would be to use memory mapping. - if (Entry.Key != ComputedHash) + m_BlockStore.IterateChunks( + ChunkLocations, + [&ChunkIndexToChunkHash, &BadChunks](size_t ChunkIndex, const void* Data, uint64_t Size) { + const IoHash ComputedHash = IoHash::HashBuffer(Data, Size); + const IoHash& ExpectedHash = ChunkIndexToChunkHash[ChunkIndex]; + if (ComputedHash != ExpectedHash) { - BadChunks.push_back({.Key = Entry.Key, .Location = Entry.Location, .Flags = CasDiskIndexEntry::kTombstone}); + // Hash mismatch + BadChunks.push_back(ExpectedHash); } - } - } + }, + [&ChunkIndexToChunkHash, &BadChunks](size_t ChunkIndex, BasicFile& BlockFile, uint64_t Offset, uint64_t Size) { + IoHashStream Hasher; + BlockFile.StreamByteRange(Offset, Size, [&](const void* Data, uint64_t Size) { Hasher.Append(Data, Size); }); + IoHash ComputedHash = Hasher.GetHash(); + const IoHash& ExpectedHash = ChunkIndexToChunkHash[ChunkIndex]; + if (ComputedHash != ExpectedHash) + { + // Hash mismatch + BadChunks.push_back(ExpectedHash); + } + }); if (BadChunks.empty()) { @@ -424,26 +396,31 @@ CasContainerStrategy::Scrub(ScrubContext& Ctx) ZEN_ERROR("Scrubbing found {} bad chunks in '{}'", BadChunks.size(), m_Config.RootDirectory / m_ContainerBaseName); + _.ReleaseNow(); // Deal with bad chunks by removing them from our lookup map - std::vector<IoHash> BadChunkHashes; - BadChunkHashes.reserve(BadChunks.size()); - - m_CasLog.Append(BadChunks); + std::vector<CasDiskIndexEntry> LogEntries; + LogEntries.reserve(BadChunks.size()); { - RwLock::ExclusiveLockScope _(m_LocationMapLock); - for (const CasDiskIndexEntry& Entry : BadChunks) + RwLock::ExclusiveLockScope __(m_LocationMapLock); + for (const IoHash& ChunkHash : BadChunks) { - BadChunkHashes.push_back(Entry.Key); - m_LocationMap.erase(Entry.Key); + const auto KeyIt = m_LocationMap.find(ChunkHash); + if (KeyIt == m_LocationMap.end()) + { + // Might have been GC'd + continue; + } + LogEntries.push_back({.Key = KeyIt->first, .Location = KeyIt->second, .Flags = CasDiskIndexEntry::kTombstone}); + m_LocationMap.erase(KeyIt); } } + m_CasLog.Append(LogEntries); // Let whomever it concerns know about the bad chunks. This could // be used to invalidate higher level data structures more efficiently // than a full validation pass might be able to do - Ctx.ReportBadCasChunks(BadChunkHashes); -#endif // 0 + Ctx.ReportBadCasChunks(BadChunks); } void @@ -481,7 +458,6 @@ CasContainerStrategy::CollectGarbage(GcContext& GcCtx) std::vector<size_t> KeepChunkIndexes; std::vector<IoHash> ChunkIndexToChunkHash; ChunkLocations.reserve(TotalChunkCount); - ChunkLocations.reserve(TotalChunkCount); ChunkIndexToChunkHash.reserve(TotalChunkCount); GcCtx.FilterCas(TotalChunkHashes, [&](const IoHash& ChunkHash, bool Keep) { diff --git a/zenstore/include/zenstore/blockstore.h b/zenstore/include/zenstore/blockstore.h index 31d9145f9..5af416b59 100644 --- a/zenstore/include/zenstore/blockstore.h +++ b/zenstore/include/zenstore/blockstore.h @@ -99,6 +99,7 @@ struct BlockStoreFile : public RefCounted void Write(const void* Data, uint64_t Size, uint64_t FileOffset); void Truncate(uint64_t Size); void Flush(); + BasicFile& GetBasicFile(); void StreamByteRange(uint64_t FileOffset, uint64_t Size, std::function<void(const void* Data, uint64_t Size)>&& ChunkFun); private: @@ -142,6 +143,9 @@ public: bool DryRun, const ReclaimCallback& Callback = [](uint32_t, const std::unordered_map<size_t, BlockStoreLocation>&, const std::vector<size_t>&) { }); + void IterateChunks(const std::vector<BlockStoreLocation>& ChunkLocations, + std::function<void(size_t ChunkIndex, const void* Data, uint64_t Size)> SmallChunkCallback, + std::function<void(size_t ChunkIndex, BasicFile& BlockFile, uint64_t Offset, uint64_t Size)> LargeChunkCallback); static const char* GetBlockFileExtension(); static std::filesystem::path GetBlockPath(const std::filesystem::path& BlocksBasePath, const uint32_t BlockIndex); |