diff options
| author | Dan Engelbrecht <[email protected]> | 2025-11-18 16:34:17 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-11-18 16:34:17 +0100 |
| commit | 2f9b8b2000b71650ff5a2b72dae3a5312e670465 (patch) | |
| tree | c58814f89b7cddc94db237f630b018e4d7982733 /src/zenremotestore/chunking/chunkedcontent.cpp | |
| parent | retain xmake state between runs (#656) (diff) | |
| download | zen-2f9b8b2000b71650ff5a2b72dae3a5312e670465.tar.xz zen-2f9b8b2000b71650ff5a2b72dae3a5312e670465.zip | |
loose chunk filtering bug when using wildcards (#654)
* fix filtering of loose chunks when downloading with a filter
add tests
* changelog
* move InlineRemoveUnusedHashes
* remove extra braces
Diffstat (limited to 'src/zenremotestore/chunking/chunkedcontent.cpp')
| -rw-r--r-- | src/zenremotestore/chunking/chunkedcontent.cpp | 461 |
1 files changed, 461 insertions, 0 deletions
diff --git a/src/zenremotestore/chunking/chunkedcontent.cpp b/src/zenremotestore/chunking/chunkedcontent.cpp index af1f06cec..ac979a64b 100644 --- a/src/zenremotestore/chunking/chunkedcontent.cpp +++ b/src/zenremotestore/chunking/chunkedcontent.cpp @@ -10,14 +10,21 @@ #include <zencore/scopeguard.h> #include <zencore/timer.h> #include <zencore/trace.h> +#include <zenremotestore/chunking/chunkblock.h> #include <zenremotestore/chunking/chunkedfile.h> #include <zenremotestore/chunking/chunkingcontroller.h> +#include <zenutil/wildcard.h> ZEN_THIRD_PARTY_INCLUDES_START #include <tsl/robin_set.h> #include <gsl/gsl-lite.hpp> ZEN_THIRD_PARTY_INCLUDES_END +#if ZEN_WITH_TESTS +# include <zencore/testing.h> +# include <zencore/testutils.h> +#endif // ZEN_WITH_TESTS + namespace zen { using namespace std::literals; @@ -715,6 +722,7 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); + Result.ChunkedContent.ChunkOrders.reserve(BaseChunkCount); tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceRawHashIndex; RawHashToSequenceRawHashIndex.reserve(ExpectedPathCount); @@ -828,6 +836,32 @@ CompareChunkedContent(const ChunkedFolderContent& Lhs, const ChunkedFolderConten return true; }; +static tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> +BuildHashLookup(std::span<const IoHash> Hashes) +{ + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> Lookup; + Lookup.reserve(Hashes.size()); + for (uint32_t Index = 0; Index < Hashes.size(); Index++) + { + Lookup.insert_or_assign(Hashes[Index], Index); + } + return Lookup; +} + +static std::vector<uint32_t> +BuildChunkOrderOffset(std::span<const uint32_t> ChunkCounts) +{ + std::vector<uint32_t> ChunkOffsets; + ChunkOffsets.reserve(ChunkCounts.size()); + uint32_t Offset = 0; + for (uint32_t SequenceIndex = 0; SequenceIndex < ChunkCounts.size(); SequenceIndex++) + { + ChunkOffsets.push_back(Offset); + Offset += ChunkCounts[SequenceIndex]; + } + return ChunkOffsets; +} + ChunkedFolderContent ChunkFolderContent(ChunkingStatistics& Stats, WorkerThreadPool& WorkerPool, @@ -979,6 +1013,8 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) const uint32_t ChunkCount = gsl::narrow<uint32_t>(Content.ChunkedContent.ChunkHashes.size()); Result.ChunkHashToChunkIndex.reserve(ChunkCount); size_t RangeOffset = 0; + Result.ChunkSequenceLocationOffset.reserve(ChunkCount); + Result.ChunkSequenceLocationCounts.reserve(ChunkCount); for (uint32_t ChunkIndex = 0; ChunkIndex < ChunkCount; ChunkIndex++) { Result.ChunkHashToChunkIndex.insert({Content.ChunkedContent.ChunkHashes[ChunkIndex], ChunkIndex}); @@ -1021,4 +1057,429 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) return Result; } +void +ValidateChunkedFolderContent(const ChunkedFolderContent& Content, + std::span<const ChunkBlockDescription> BlockDescriptions, + std::span<const IoHash> LooseChunks) +{ + size_t TotalKnownChunkCount = LooseChunks.size(); + for (const ChunkBlockDescription& BlockDescription : BlockDescriptions) + { + TotalKnownChunkCount += BlockDescription.ChunkRawHashes.size(); + } + + tsl::robin_set<IoHash, IoHash::Hasher> KnownChunks; + KnownChunks.reserve(TotalKnownChunkCount); + KnownChunks.insert(LooseChunks.begin(), LooseChunks.end()); + for (const ChunkBlockDescription& BlockDescription : BlockDescriptions) + { + KnownChunks.insert(BlockDescription.ChunkRawHashes.begin(), BlockDescription.ChunkRawHashes.end()); + } + + std::vector<uint32_t> ChunkOrderOffsets = BuildChunkOrderOffset(Content.ChunkedContent.ChunkCounts); + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> SequenceIndexLookup = BuildHashLookup(Content.ChunkedContent.SequenceRawHashes); + std::vector<size_t> SequenceUseCount(Content.ChunkedContent.SequenceRawHashes.size(), 0); + std::vector<size_t> ChunkUseCount(Content.ChunkedContent.ChunkHashes.size(), 0); + for (size_t PathIndex = 0; PathIndex < Content.Paths.size(); PathIndex++) + { + const std::filesystem::path& Path = Content.Paths[PathIndex]; + if (Path.empty()) + { + throw std::runtime_error("Chunked folder content contains empty path"); + } + const uint64_t RawSize = Content.RawSizes[PathIndex]; + const IoHash RawHash = Content.RawHashes[PathIndex]; + if (RawSize > 0) + { + if (auto It = SequenceIndexLookup.find(RawHash); It != SequenceIndexLookup.end()) + { + const uint32_t SourceSequenceIndex = It->second; + SequenceUseCount[SourceSequenceIndex]++; + const uint32_t ChunkOrderOffset = ChunkOrderOffsets[SourceSequenceIndex]; + const uint32_t ChunkCount = Content.ChunkedContent.ChunkCounts[SourceSequenceIndex]; + + std::span<const uint32_t> ChunkIndexes = + std::span<const uint32_t>(Content.ChunkedContent.ChunkOrders).subspan(ChunkOrderOffset, ChunkCount); + + IoHashStream Hasher; + uint64_t SizeSum = 0; + for (uint32_t ChunkIndex : ChunkIndexes) + { + ChunkUseCount[ChunkIndex]++; + const uint64_t ChunkSize = Content.ChunkedContent.ChunkRawSizes[ChunkIndex]; + if (ChunkSize == 0) + { + throw std::runtime_error("Chunked folder content contains zero size chunk"); + } + const IoHash& ChunkRawHash = Content.ChunkedContent.ChunkHashes[ChunkIndex]; + if (ChunkRawHash == IoHash::Zero) + { + throw std::runtime_error("Chunked folder content contains zero chunk hash"); + } + if (!KnownChunks.contains(ChunkRawHash)) + { + throw std::runtime_error(fmt::format("Chunked folder content references an unknown chunk '{}'", ChunkRawHash)); + } + SizeSum += ChunkSize; + } + if (SizeSum != RawSize) + { + throw std::runtime_error( + fmt::format("Chunked folder content sequence size {} does not match expected size '{}'", SizeSum, RawSize)); + } + } + else + { + throw std::runtime_error(fmt::format("Chunked folder content references unknown sequence hash '{}'", RawHash)); + } + } + else + { + if (RawHash != IoHash::Zero) + { + throw std::runtime_error( + fmt::format("Chunked folder content references zero size sequence with non-zero hash '{}'", RawHash)); + } + } + } + + for (uint32_t SequenceIndex = 0; SequenceIndex < SequenceUseCount.size(); SequenceIndex++) + { + if (SequenceUseCount[SequenceIndex] == 0) + { + throw std::runtime_error( + fmt::format("Chunked folder has unused sequence '{}'", Content.ChunkedContent.SequenceRawHashes[SequenceIndex])); + } + } + for (uint32_t ChunkIndex = 0; ChunkIndex < ChunkUseCount.size(); ChunkIndex++) + { + if (ChunkUseCount[ChunkIndex] == 0) + { + throw std::runtime_error(fmt::format("Chunked folder has unused chunk '{}'", Content.ChunkedContent.ChunkHashes[ChunkIndex])); + } + } +} + +void +InlineRemoveUnusedHashes(std::vector<IoHash>& InOutHashes, std::span<const IoHash> UsedHashes) +{ + tsl::robin_set<IoHash, IoHash::Hasher> UsedChunkHashes; + UsedChunkHashes.reserve(UsedHashes.size()); + UsedChunkHashes.insert(UsedHashes.begin(), UsedHashes.end()); + for (auto It = InOutHashes.begin(); It != InOutHashes.end();) + { + if (!UsedChunkHashes.contains(*It)) + { + It = InOutHashes.erase(It); + } + else + { + It++; + } + } +} + +#if ZEN_WITH_TESTS + +void +chunkedcontent_forcelink() +{ +} + +namespace chunked_test_utils { + struct ChunkedFile + { + IoHash RawHash; + std::vector<IoHash> ChunkHashes; + std::vector<uint64_t> ChunkSizes; + std::vector<IoBuffer> Chunks; + }; + + ChunkedFile CreateChunkedFile(FastRandom& Random, size_t Size, size_t ChunkingSize) + { + size_t ChunkCount = (Size + (ChunkingSize - 1)) / ChunkingSize; + std::vector<IoHash> ChunkHashes; + std::vector<uint64_t> ChunkSizes; + std::vector<IoBuffer> Chunks; + ChunkHashes.reserve(ChunkCount); + ChunkSizes.reserve(ChunkCount); + + IoHashStream HashStream; + while (Size > 0) + { + size_t ChunkSize = Min(Size, ChunkingSize); + IoBuffer ChunkBuffer = CreateRandomBlob(Random, ChunkSize); + HashStream.Append(ChunkBuffer); + ChunkHashes.push_back(IoHash::HashBuffer(ChunkBuffer)); + ChunkSizes.push_back(ChunkSize); + Chunks.emplace_back(std::move(ChunkBuffer)); + Size -= ChunkSize; + } + return ChunkedFile{.RawHash = HashStream.GetHash(), + .ChunkHashes = std::move(ChunkHashes), + .ChunkSizes = std::move(ChunkSizes), + .Chunks = std::move(Chunks)}; + } + + ChunkedFolderContent CreateChunkedFolderContent(FastRandom& Random, + std::span<const std::pair<const std::string, uint64_t>> PathAndSizes, + uint64_t ChunkingSize, + std::vector<IoBuffer>& ChunkPayloads) + { + ChunkedFolderContent Result; + Result.Paths.reserve(PathAndSizes.size()); + Result.RawSizes.reserve(PathAndSizes.size()); + Result.Attributes.reserve(PathAndSizes.size()); + Result.RawHashes.reserve(PathAndSizes.size()); + ChunkPayloads.reserve(PathAndSizes.size()); + + tsl::robin_map<IoHash, uint32_t> SequenceToIndex; + tsl::robin_map<IoHash, uint32_t> ChunkToIndex; + for (size_t PathIndex = 0; PathIndex < PathAndSizes.size(); PathIndex++) + { + const std::string& Path = PathAndSizes[PathIndex].first; + const uint64_t Size = PathAndSizes[PathIndex].second; + + Result.Paths.push_back(Path); + Result.RawSizes.push_back(Size); + Result.Attributes.push_back(0); + + if (Size > 0) + { + ChunkedFile File = CreateChunkedFile(Random, Size, ChunkingSize); + Result.RawHashes.push_back(File.RawHash); + + if (auto SequenceIt = SequenceToIndex.find(File.RawHash); SequenceIt == SequenceToIndex.end()) + { + SequenceToIndex.insert_or_assign(File.RawHash, gsl::narrow<uint32_t>(Result.ChunkedContent.SequenceRawHashes.size())); + Result.ChunkedContent.SequenceRawHashes.push_back(File.RawHash); + Result.ChunkedContent.ChunkCounts.push_back(gsl::narrow<uint32_t>(File.ChunkHashes.size())); + for (size_t ChunkIndex = 0; ChunkIndex < File.ChunkHashes.size(); ChunkIndex++) + { + const IoHash& ChunkHash = File.ChunkHashes[ChunkIndex]; + if (auto ChunkIt = ChunkToIndex.find(ChunkHash); ChunkIt == ChunkToIndex.end()) + { + const uint32_t ChunkedContentChunkIndex = gsl::narrow<uint32_t>(Result.ChunkedContent.ChunkOrders.size()); + Result.ChunkedContent.ChunkOrders.push_back(gsl::narrow<uint32_t>(ChunkedContentChunkIndex)); + + Result.ChunkedContent.ChunkHashes.push_back(ChunkHash); + Result.ChunkedContent.ChunkRawSizes.push_back(File.ChunkSizes[ChunkIndex]); + ChunkPayloads.push_back(std::move(File.Chunks[ChunkIndex])); + } + else + { + const uint32_t ChunkedContentChunkIndex = ChunkIt->second; + Result.ChunkedContent.ChunkOrders.push_back(ChunkedContentChunkIndex); + } + } + } + } + else + { + Result.RawHashes.push_back(IoHash::Zero); + } + } + return Result; + } +# if 0 + void ValidateChunkedFolderContent(const ChunkedFolderContent& Content, std::span<const IoBuffer> Chunks) + { + std::vector<uint32_t> ChunkOrderOffsets = BuildChunkOrderOffset(Content.ChunkedContent.ChunkCounts); + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> SequenceIndexLookup = BuildHashLookup(Content.ChunkedContent.SequenceRawHashes); + std::vector<size_t> SequenceUseCount(Content.ChunkedContent.SequenceRawHashes.size(), 0); + std::vector<size_t> ChunkUseCount(Content.ChunkedContent.ChunkHashes.size(), 0); + for (size_t PathIndex = 0; PathIndex < Content.Paths.size(); PathIndex++) + { + const std::filesystem::path& Path = Content.Paths[PathIndex]; + ZEN_ASSERT(!Path.empty()); + const uint64_t RawSize = Content.RawSizes[PathIndex]; + const IoHash RawHash = Content.RawHashes[PathIndex]; + if (RawSize > 0) + { + if (auto It = SequenceIndexLookup.find(RawHash); It != SequenceIndexLookup.end()) + { + const uint32_t SourceSequenceIndex = It->second; + SequenceUseCount[SourceSequenceIndex]++; + const uint32_t ChunkOrderOffset = ChunkOrderOffsets[SourceSequenceIndex]; + const uint32_t ChunkCount = Content.ChunkedContent.ChunkCounts[SourceSequenceIndex]; + + std::span<const uint32_t> ChunkIndexes = + std::span<const uint32_t>(Content.ChunkedContent.ChunkOrders).subspan(ChunkOrderOffset, ChunkCount); + + IoHashStream Hasher; + uint64_t SizeSum = 0; + for (uint32_t ChunkIndex : ChunkIndexes) + { + ChunkUseCount[ChunkIndex]++; + const IoBuffer& ChunkBuffer = Chunks[ChunkIndex]; + const uint64_t ChunkSize = Content.ChunkedContent.ChunkRawSizes[ChunkIndex]; + const IoHash& ChunkRawHash = Content.ChunkedContent.ChunkHashes[ChunkIndex]; + SizeSum += ChunkSize; + CHECK_EQ(ChunkRawHash, IoHash::HashBuffer(ChunkBuffer)); + Hasher.Append(ChunkBuffer); + } + CHECK_EQ(RawHash, Hasher.GetHash()); + CHECK_EQ(SizeSum, RawSize); + } + else + { + CHECK(false); + } + } + else + { + CHECK(RawHash == IoHash::Zero); + } + } + + for (uint32_t SequenceIndex = 0; SequenceIndex < SequenceUseCount.size(); SequenceIndex++) + { + CHECK(SequenceUseCount[SequenceIndex] > 0); + } + for (uint32_t ChunkIndex = 0; ChunkIndex < ChunkUseCount.size(); ChunkIndex++) + { + CHECK(ChunkUseCount[ChunkIndex] > 0); + } + } +# endif // 0 + std::vector<IoBuffer> GetChunkPayloads(std::span<const IoHash> BaseHashes, + std::span<const IoBuffer> BaseChunks, + std::span<const IoHash> OverlayHashes, + std::span<const IoBuffer> OverlayChunks, + std::span<const IoHash> WantedHashes) + { + std::vector<IoBuffer> Result; + Result.reserve(WantedHashes.size()); + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> BaseChunkLookup = BuildHashLookup(BaseHashes); + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> OverlayChunkLookup = BuildHashLookup(OverlayHashes); + for (const IoHash& ChunkHash : WantedHashes) + { + if (auto It = BaseChunkLookup.find(ChunkHash); It != BaseChunkLookup.end()) + { + Result.push_back(BaseChunks[It->second]); + } + else if (It = OverlayChunkLookup.find(ChunkHash); It != OverlayChunkLookup.end()) + { + Result.push_back(OverlayChunks[It->second]); + } + else + { + CHECK(false); + } + } + return Result; + } + + tsl::robin_map<std::string, uint32_t> BuildPathLookup(std::span<const std::filesystem::path> Paths) + { + tsl::robin_map<std::string, uint32_t> Result; + Result.reserve(Paths.size()); + for (size_t Index = 0; Index < Paths.size(); Index++) + { + const std::filesystem::path& Path = Paths[Index]; + Result.insert_or_assign(Path.generic_string(), Index); + } + return Result; + } + + bool IncludePath(std::span<const std::string> IncludeWildcards, + std::span<const std::string> ExcludeWildcards, + const std::filesystem::path& Path) + { + const std::string PathString = Path.generic_string(); + bool IncludePath = true; + if (!IncludeWildcards.empty()) + { + IncludePath = false; + for (const std::string& IncludeWildcard : IncludeWildcards) + { + if (MatchWildcard(IncludeWildcard, PathString, /*CaseSensitive*/ false)) + { + IncludePath = true; + break; + } + } + if (!IncludePath) + { + return false; + } + } + for (const std::string& ExcludeWildcard : ExcludeWildcards) + { + if (MatchWildcard(ExcludeWildcard, PathString, /*CaseSensitive*/ false)) + { + return false; + } + } + return true; + } + +} // namespace chunked_test_utils + +TEST_CASE("DeletePathsFromContent") +{ + FastRandom BaseRandom; + + std::vector<IoBuffer> BaseChunks; + + const std::string BasePaths[11] = {{"file_1"}, + {"file_2.exe"}, + {"file_3.txt"}, + {"dir_1/dir1_file_1.exe"}, + {"dir_1/dir1_file_2.pdb"}, + {"dir_1/dir1_file_3.txt"}, + {"dir_2/dir2_dir1/dir2_dir1_file_1.exe"}, + {"dir_2/dir2_dir1/dir2_dir1_file_2.pdb"}, + {"dir_2/dir2_dir1/dir2_dir1_file_3.dll"}, + {"dir_2/dir2_dir2/dir2_dir2_file_1.txt"}, + {"dir_2/dir2_dir2/dir2_dir2_file_2.json"}}; + const uint64_t BaseSizes[11] = + {6u * 1024u, 0, 798, 19u * 1024u, 7u * 1024u, 93, 31u * 1024u, 17u * 1024u, 13u * 1024u, 2u * 1024u, 3u * 1024u}; + + std::pair<const std::string, uint64_t> BasePathAndSizes[11] = {{BasePaths[0], BaseSizes[0]}, + {BasePaths[1], BaseSizes[1]}, + {BasePaths[2], BaseSizes[2]}, + {BasePaths[3], BaseSizes[3]}, + {BasePaths[4], BaseSizes[4]}, + {BasePaths[5], BaseSizes[5]}, + {BasePaths[6], BaseSizes[6]}, + {BasePaths[7], BaseSizes[7]}, + {BasePaths[8], BaseSizes[8]}, + {BasePaths[9], BaseSizes[9]}, + {BasePaths[10], BaseSizes[10]}}; + + ChunkedFolderContent Base = chunked_test_utils::CreateChunkedFolderContent(BaseRandom, BasePathAndSizes, 4u * 1024u, BaseChunks); + ValidateChunkedFolderContent(Base, {}, Base.ChunkedContent.ChunkHashes); + + tsl::robin_map<IoHash, size_t, IoHash::Hasher> BaseChunksLookup; + for (size_t Index = 0; Index < BaseChunks.size(); Index++) + { + BaseChunksLookup.insert_or_assign(Base.ChunkedContent.ChunkHashes[Index], Index); + } + + std::vector<std::string> IncludeWildcards; + std::vector<std::string> ExcludeWildcards = {"*.map*", "*.pdb*", "*optional*", "*Manifest_*", "*pakchunk10sm6*"}; + + std::vector<std::filesystem::path> DeletedPaths; + for (const std::filesystem::path& RemotePath : Base.Paths) + { + if (!chunked_test_utils::IncludePath(IncludeWildcards, ExcludeWildcards, RemotePath)) + { + DeletedPaths.push_back(RemotePath); + } + } + + ZEN_ASSERT(DeletedPaths.size() == 2); + + ChunkedFolderContent FilteredContent = DeletePathsFromChunkedContent(Base, DeletedPaths); + + std::vector<IoHash> FilteredChunks = Base.ChunkedContent.ChunkHashes; + + InlineRemoveUnusedHashes(FilteredChunks, FilteredContent.ChunkedContent.ChunkHashes); + + ValidateChunkedFolderContent(FilteredContent, {}, FilteredChunks); +} + +#endif // ZEN_WITH_TESTS + } // namespace zen |