diff options
| author | Dan Engelbrecht <[email protected]> | 2025-12-15 13:20:21 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-12-15 13:20:21 +0100 |
| commit | a715d3ab7701e6257730a73c62567052d21c9771 (patch) | |
| tree | 1f6b1de9c7cf11ec1403187d77d74a3b1af52a39 | |
| parent | show download source data (#689) (diff) | |
| download | zen-a715d3ab7701e6257730a73c62567052d21c9771.tar.xz zen-a715d3ab7701e6257730a73c62567052d21c9771.zip | |
oplog download size (#690)
- Bugfix: Upload of oplogs could reference multiple blocks for the same chunk causing redundant downloads of blocks
- Improvement: Use the improved block reuse selection function from `zen builds upload` in `zen oplog-export` to reduce oplog download size
| -rw-r--r-- | CHANGELOG.md | 2 | ||||
| -rw-r--r-- | src/zen/cmds/builds_cmd.cpp | 35 | ||||
| -rw-r--r-- | src/zenremotestore/builds/buildstorageoperations.cpp | 207 | ||||
| -rw-r--r-- | src/zenremotestore/chunking/chunkblock.cpp | 431 | ||||
| -rw-r--r-- | src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h | 41 | ||||
| -rw-r--r-- | src/zenremotestore/include/zenremotestore/chunking/chunkblock.h | 23 | ||||
| -rw-r--r-- | src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h | 2 | ||||
| -rw-r--r-- | src/zenremotestore/projectstore/buildsremoteprojectstore.cpp | 7 | ||||
| -rw-r--r-- | src/zenremotestore/projectstore/fileremoteprojectstore.cpp | 15 | ||||
| -rw-r--r-- | src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp | 11 | ||||
| -rw-r--r-- | src/zenremotestore/projectstore/remoteprojectstore.cpp | 185 | ||||
| -rw-r--r-- | src/zenstore/cas.cpp | 18 |
12 files changed, 657 insertions, 320 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index af0cca1a2..ec8cb4906 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ ## +- Bugfix: Upload of oplogs could reference multiple blocks for the same chunk causing redundant downloads of blocks - Improvement: At end of `zen builds download` and `zen oplog-import` we now show information over the source of the data - cache, cloud storage, s3/azure, replication +- Improvement: Use the improved block reuse selection function from `zen builds upload` in `zen oplog-export` to reduce oplog download size ## 5.7.14 - Improvement: asio http server now supports the `--dedicated` option which prevents port remapping diff --git a/src/zen/cmds/builds_cmd.cpp b/src/zen/cmds/builds_cmd.cpp index 3b08a9290..6a97ab542 100644 --- a/src/zen/cmds/builds_cmd.cpp +++ b/src/zen/cmds/builds_cmd.cpp @@ -520,14 +520,6 @@ namespace { "\n FoundBlockChunkCount: {}" "\n FoundBlockByteCount: {}" "\n AcceptedBlockCount: {}" - "\n AcceptedChunkCount: {}" - "\n AcceptedByteCount: {}" - "\n AcceptedRawByteCount: {}" - "\n RejectedBlockCount: {}" - "\n RejectedChunkCount: {}" - "\n RejectedByteCount: {}" - "\n AcceptedReduntantChunkCount: {}" - "\n AcceptedReduntantByteCount: {}" "\n NewBlocksCount: {}" "\n NewBlocksChunkCount: {}" "\n NewBlocksChunkByteCount: {}", @@ -538,19 +530,30 @@ namespace { UploadOp.m_FindBlocksStats.FoundBlockChunkCount, NiceBytes(UploadOp.m_FindBlocksStats.FoundBlockByteCount), UploadOp.m_FindBlocksStats.AcceptedBlockCount, - UploadOp.m_FindBlocksStats.AcceptedChunkCount, - NiceBytes(UploadOp.m_FindBlocksStats.AcceptedByteCount), - NiceBytes(UploadOp.m_FindBlocksStats.AcceptedRawByteCount), - UploadOp.m_FindBlocksStats.RejectedBlockCount, - UploadOp.m_FindBlocksStats.RejectedChunkCount, - NiceBytes(UploadOp.m_FindBlocksStats.RejectedByteCount), - UploadOp.m_FindBlocksStats.AcceptedReduntantChunkCount, - NiceBytes(UploadOp.m_FindBlocksStats.AcceptedReduntantByteCount), UploadOp.m_FindBlocksStats.NewBlocksCount, 
UploadOp.m_FindBlocksStats.NewBlocksChunkCount, NiceBytes(UploadOp.m_FindBlocksStats.NewBlocksChunkByteCount)); ZEN_CONSOLE_VERBOSE( + "Reuse block stats:" + "\n AcceptedChunkCount: {}" + "\n AcceptedByteCount: {}" + "\n AcceptedRawByteCount: {}" + "\n RejectedBlockCount: {}" + "\n RejectedChunkCount: {}" + "\n RejectedByteCount: {}" + "\n AcceptedReduntantChunkCount: {}" + "\n AcceptedReduntantByteCount: {}", + UploadOp.m_ReuseBlocksStats.AcceptedChunkCount, + NiceBytes(UploadOp.m_ReuseBlocksStats.AcceptedByteCount), + NiceBytes(UploadOp.m_ReuseBlocksStats.AcceptedRawByteCount), + UploadOp.m_ReuseBlocksStats.RejectedBlockCount, + UploadOp.m_ReuseBlocksStats.RejectedChunkCount, + NiceBytes(UploadOp.m_ReuseBlocksStats.RejectedByteCount), + UploadOp.m_ReuseBlocksStats.AcceptedReduntantChunkCount, + NiceBytes(UploadOp.m_ReuseBlocksStats.AcceptedReduntantByteCount)); + + ZEN_CONSOLE_VERBOSE( "Generate blocks stats:" "\n GeneratedBlockByteCount: {}" "\n GeneratedBlockCount: {}" diff --git a/src/zenremotestore/builds/buildstorageoperations.cpp b/src/zenremotestore/builds/buildstorageoperations.cpp index b8dd18bb5..6c370f975 100644 --- a/src/zenremotestore/builds/buildstorageoperations.cpp +++ b/src/zenremotestore/builds/buildstorageoperations.cpp @@ -4998,7 +4998,11 @@ BuildsOperationUploadFolder::Execute() } else { - ReuseBlockIndexes = FindReuseBlocks(PrepBuildResult.KnownBlocks, + ReuseBlockIndexes = FindReuseBlocks(m_LogOutput, + m_Options.BlockReuseMinPercentLimit, + m_Options.IsVerbose, + m_ReuseBlocksStats, + PrepBuildResult.KnownBlocks, LocalContent.ChunkedContent.ChunkHashes, BlockChunkIndexes, NewBlockChunkIndexes); @@ -5027,13 +5031,13 @@ BuildsOperationUploadFolder::Execute() const double AcceptedByteCountPercent = m_FindBlocksStats.PotentialChunkByteCount > 0 - ? (100.0 * m_FindBlocksStats.AcceptedRawByteCount / m_FindBlocksStats.PotentialChunkByteCount) + ? 
(100.0 * m_ReuseBlocksStats.AcceptedRawByteCount / m_FindBlocksStats.PotentialChunkByteCount) : 0.0; const double AcceptedReduntantByteCountPercent = - m_FindBlocksStats.AcceptedByteCount > 0 - ? (100.0 * m_FindBlocksStats.AcceptedReduntantByteCount) / - (m_FindBlocksStats.AcceptedByteCount + m_FindBlocksStats.AcceptedReduntantByteCount) + m_ReuseBlocksStats.AcceptedByteCount > 0 + ? (100.0 * m_ReuseBlocksStats.AcceptedReduntantByteCount) / + (m_ReuseBlocksStats.AcceptedByteCount + m_ReuseBlocksStats.AcceptedReduntantByteCount) : 0.0; if (!m_Options.IsQuiet) { @@ -5050,18 +5054,18 @@ BuildsOperationUploadFolder::Execute() NiceBytes(m_FindBlocksStats.FoundBlockByteCount), NiceTimeSpanMs(m_FindBlocksStats.FindBlockTimeMS), - m_FindBlocksStats.AcceptedChunkCount, - NiceBytes(m_FindBlocksStats.AcceptedRawByteCount), + m_ReuseBlocksStats.AcceptedChunkCount, + NiceBytes(m_ReuseBlocksStats.AcceptedRawByteCount), m_FindBlocksStats.AcceptedBlockCount, AcceptedByteCountPercent, - m_FindBlocksStats.AcceptedReduntantChunkCount, - NiceBytes(m_FindBlocksStats.AcceptedReduntantByteCount), + m_ReuseBlocksStats.AcceptedReduntantChunkCount, + NiceBytes(m_ReuseBlocksStats.AcceptedReduntantByteCount), AcceptedReduntantByteCountPercent, - m_FindBlocksStats.RejectedChunkCount, - NiceBytes(m_FindBlocksStats.RejectedByteCount), - m_FindBlocksStats.RejectedBlockCount, + m_ReuseBlocksStats.RejectedChunkCount, + NiceBytes(m_ReuseBlocksStats.RejectedByteCount), + m_ReuseBlocksStats.RejectedBlockCount, m_FindBlocksStats.NewBlocksChunkCount, NiceBytes(m_FindBlocksStats.NewBlocksChunkByteCount), @@ -5497,7 +5501,7 @@ BuildsOperationUploadFolder::Execute() {{"totalSize", double(m_LocalFolderScanStats.FoundFileByteCount.load())}, {"reusedRatio", AcceptedByteCountPercent / 100.0}, {"reusedBlockCount", double(m_FindBlocksStats.AcceptedBlockCount)}, - {"reusedBlockByteCount", double(m_FindBlocksStats.AcceptedRawByteCount)}, + {"reusedBlockByteCount", double(m_ReuseBlocksStats.AcceptedRawByteCount)}, 
{"newBlockCount", double(m_FindBlocksStats.NewBlocksCount)}, {"newBlockByteCount", double(m_FindBlocksStats.NewBlocksChunkByteCount)}, {"uploadedCount", double(m_UploadStats.BlockCount.load() + m_UploadStats.ChunkCount.load())}, @@ -5589,183 +5593,6 @@ BuildsOperationUploadFolder::IsAcceptedFile(const std::string_view& RelativePath return true; } -std::vector<size_t> -BuildsOperationUploadFolder::FindReuseBlocks(const std::vector<ChunkBlockDescription>& KnownBlocks, - std::span<const IoHash> ChunkHashes, - std::span<const uint32_t> ChunkIndexes, - std::vector<uint32_t>& OutUnusedChunkIndexes) -{ - ZEN_TRACE_CPU("FindReuseBlocks"); - - // Find all blocks with a usage level higher than MinPercentLimit - // Pick out the blocks with usage higher or equal to MinPercentLimit - // Sort them with highest size usage - most usage first - // Make a list of all chunks and mark them as not found - // For each block, recalculate the block has usage percent based on the chunks marked as not found - // If the block still reaches MinPercentLimit, keep it and remove the matching chunks from the not found list - // Repeat for following all remaining block that initially matched MinPercentLimit - - std::vector<size_t> FilteredReuseBlockIndexes; - - uint32_t ChunkCount = gsl::narrow<uint32_t>(ChunkHashes.size()); - std::vector<bool> ChunkFound(ChunkCount, false); - - if (ChunkCount > 0) - { - if (!KnownBlocks.empty()) - { - Stopwatch ReuseTimer; - - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; - ChunkHashToChunkIndex.reserve(ChunkIndexes.size()); - for (uint32_t ChunkIndex : ChunkIndexes) - { - ChunkHashToChunkIndex.insert_or_assign(ChunkHashes[ChunkIndex], ChunkIndex); - } - - std::vector<size_t> BlockSizes(KnownBlocks.size(), 0); - std::vector<size_t> BlockUseSize(KnownBlocks.size(), 0); - - std::vector<size_t> ReuseBlockIndexes; - - for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++) - { - const ChunkBlockDescription& 
KnownBlock = KnownBlocks[KnownBlockIndex]; - if (KnownBlock.BlockHash != IoHash::Zero && KnownBlock.ChunkRawHashes.size() == KnownBlock.ChunkCompressedLengths.size()) - { - size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size(); - if (BlockAttachmentCount == 0) - { - continue; - } - size_t ReuseSize = 0; - size_t BlockSize = 0; - size_t FoundAttachmentCount = 0; - size_t BlockChunkCount = KnownBlock.ChunkRawHashes.size(); - for (size_t BlockChunkIndex = 0; BlockChunkIndex < BlockChunkCount; BlockChunkIndex++) - { - const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex]; - const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex]; - BlockSize += BlockChunkSize; - if (ChunkHashToChunkIndex.contains(BlockChunkHash)) - { - ReuseSize += BlockChunkSize; - FoundAttachmentCount++; - } - } - - size_t ReusePercent = (ReuseSize * 100) / BlockSize; - - if (ReusePercent >= m_Options.BlockReuseMinPercentLimit) - { - if (m_Options.IsVerbose) - { - ZEN_OPERATION_LOG_INFO(m_LogOutput, - "Reusing block {}. {} attachments found, usage level: {}%", - KnownBlock.BlockHash, - FoundAttachmentCount, - ReusePercent); - } - ReuseBlockIndexes.push_back(KnownBlockIndex); - - BlockSizes[KnownBlockIndex] = BlockSize; - BlockUseSize[KnownBlockIndex] = ReuseSize; - } - else if (FoundAttachmentCount > 0) - { - // if (m_Options.IsVerbose) - //{ - // ZEN_OPERATION_LOG_INFO(m_LogOutput, "Skipping block {}. 
{} attachments found, usage level: {}%", - // KnownBlock.BlockHash, - // FoundAttachmentCount, ReusePercent); - //} - m_FindBlocksStats.RejectedBlockCount++; - m_FindBlocksStats.RejectedChunkCount += FoundAttachmentCount; - m_FindBlocksStats.RejectedByteCount += ReuseSize; - } - } - } - - if (!ReuseBlockIndexes.empty()) - { - std::sort(ReuseBlockIndexes.begin(), ReuseBlockIndexes.end(), [&](size_t Lhs, size_t Rhs) { - return BlockUseSize[Lhs] > BlockUseSize[Rhs]; - }); - - for (size_t KnownBlockIndex : ReuseBlockIndexes) - { - std::vector<uint32_t> FoundChunkIndexes; - size_t BlockSize = 0; - size_t AdjustedReuseSize = 0; - size_t AdjustedRawReuseSize = 0; - const ChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; - for (size_t BlockChunkIndex = 0; BlockChunkIndex < KnownBlock.ChunkRawHashes.size(); BlockChunkIndex++) - { - const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex]; - const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex]; - BlockSize += BlockChunkSize; - if (auto It = ChunkHashToChunkIndex.find(BlockChunkHash); It != ChunkHashToChunkIndex.end()) - { - const uint32_t ChunkIndex = It->second; - if (!ChunkFound[ChunkIndex]) - { - FoundChunkIndexes.push_back(ChunkIndex); - AdjustedReuseSize += KnownBlock.ChunkCompressedLengths[BlockChunkIndex]; - AdjustedRawReuseSize += KnownBlock.ChunkRawLengths[BlockChunkIndex]; - } - } - } - - size_t ReusePercent = (AdjustedReuseSize * 100) / BlockSize; - - if (ReusePercent >= m_Options.BlockReuseMinPercentLimit) - { - if (m_Options.IsVerbose) - { - ZEN_OPERATION_LOG_INFO(m_LogOutput, - "Reusing block {}. 
{} attachments found, usage level: {}%", - KnownBlock.BlockHash, - FoundChunkIndexes.size(), - ReusePercent); - } - FilteredReuseBlockIndexes.push_back(KnownBlockIndex); - - for (uint32_t ChunkIndex : FoundChunkIndexes) - { - ChunkFound[ChunkIndex] = true; - } - m_FindBlocksStats.AcceptedChunkCount += FoundChunkIndexes.size(); - m_FindBlocksStats.AcceptedByteCount += AdjustedReuseSize; - m_FindBlocksStats.AcceptedRawByteCount += AdjustedRawReuseSize; - m_FindBlocksStats.AcceptedReduntantChunkCount += KnownBlock.ChunkRawHashes.size() - FoundChunkIndexes.size(); - m_FindBlocksStats.AcceptedReduntantByteCount += BlockSize - AdjustedReuseSize; - } - else - { - // if (m_Options.IsVerbose) - //{ - // ZEN_OPERATION_LOG_INFO(m_LogOutput, "Skipping block {}. filtered usage level: {}%", KnownBlock.BlockHash, - // ReusePercent); - //} - m_FindBlocksStats.RejectedBlockCount++; - m_FindBlocksStats.RejectedChunkCount += FoundChunkIndexes.size(); - m_FindBlocksStats.RejectedByteCount += AdjustedReuseSize; - } - } - } - } - OutUnusedChunkIndexes.reserve(ChunkIndexes.size() - m_FindBlocksStats.AcceptedChunkCount); - for (uint32_t ChunkIndex : ChunkIndexes) - { - if (!ChunkFound[ChunkIndex]) - { - OutUnusedChunkIndexes.push_back(ChunkIndex); - } - } - } - return FilteredReuseBlockIndexes; -} - void BuildsOperationUploadFolder::ArrangeChunksIntoBlocks(const ChunkedFolderContent& Content, const ChunkedContentLookup& Lookup, diff --git a/src/zenremotestore/chunking/chunkblock.cpp b/src/zenremotestore/chunking/chunkblock.cpp index 05ae13de1..a5d0db205 100644 --- a/src/zenremotestore/chunking/chunkblock.cpp +++ b/src/zenremotestore/chunking/chunkblock.cpp @@ -5,14 +5,23 @@ #include <zencore/compactbinarybuilder.h> #include <zencore/fmtutils.h> #include <zencore/logging.h> +#include <zencore/timer.h> +#include <zencore/trace.h> + +#include <zenremotestore/operationlogoutput.h> #include <vector> +ZEN_THIRD_PARTY_INCLUDES_START +#include <tsl/robin_map.h> +ZEN_THIRD_PARTY_INCLUDES_END + #if 
ZEN_WITH_TESTS # include <zencore/testing.h> # include <zencore/testutils.h> # include <unordered_map> +# include <numeric> #endif // ZEN_WITH_TESTS namespace zen { @@ -261,6 +270,188 @@ IterateChunkBlock(const SharedBuffer& BlockPayload, return true; }; +std::vector<size_t> +FindReuseBlocks(OperationLogOutput& Output, + const uint8_t BlockReuseMinPercentLimit, + const bool IsVerbose, + ReuseBlocksStatistics& Stats, + const std::vector<ChunkBlockDescription>& KnownBlocks, + std::span<const IoHash> ChunkHashes, + std::span<const uint32_t> ChunkIndexes, + std::vector<uint32_t>& OutUnusedChunkIndexes) +{ + ZEN_TRACE_CPU("FindReuseBlocks"); + + // Find all blocks with a usage level higher than MinPercentLimit + // Pick out the blocks with usage higher or equal to MinPercentLimit + // Sort them with highest size usage - most usage first + // Make a list of all chunks and mark them as not found + // For each block, recalculate the block has usage percent based on the chunks marked as not found + // If the block still reaches MinPercentLimit, keep it and remove the matching chunks from the not found list + // Repeat for following all remaining block that initially matched MinPercentLimit + + std::vector<size_t> FilteredReuseBlockIndexes; + + uint32_t ChunkCount = gsl::narrow<uint32_t>(ChunkHashes.size()); + std::vector<bool> ChunkFound(ChunkCount, false); + + if (ChunkCount > 0) + { + if (!KnownBlocks.empty()) + { + Stopwatch ReuseTimer; + + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + ChunkHashToChunkIndex.reserve(ChunkIndexes.size()); + for (uint32_t ChunkIndex : ChunkIndexes) + { + ChunkHashToChunkIndex.insert_or_assign(ChunkHashes[ChunkIndex], ChunkIndex); + } + + std::vector<size_t> BlockSizes(KnownBlocks.size(), 0); + std::vector<size_t> BlockUseSize(KnownBlocks.size(), 0); + + std::vector<size_t> ReuseBlockIndexes; + + for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++) + { + const 
ChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; + + if (KnownBlock.BlockHash != IoHash::Zero && KnownBlock.ChunkRawHashes.size() == KnownBlock.ChunkCompressedLengths.size()) + { + size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size(); + if (BlockAttachmentCount == 0) + { + continue; + } + size_t ReuseSize = 0; + size_t BlockSize = 0; + size_t FoundAttachmentCount = 0; + size_t BlockChunkCount = KnownBlock.ChunkRawHashes.size(); + for (size_t BlockChunkIndex = 0; BlockChunkIndex < BlockChunkCount; BlockChunkIndex++) + { + const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex]; + const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex]; + BlockSize += BlockChunkSize; + if (ChunkHashToChunkIndex.contains(BlockChunkHash)) + { + ReuseSize += BlockChunkSize; + FoundAttachmentCount++; + } + } + + size_t ReusePercent = (ReuseSize * 100) / BlockSize; + + if (ReusePercent >= BlockReuseMinPercentLimit) + { + if (IsVerbose) + { + ZEN_OPERATION_LOG_INFO(Output, + "Reusing block {}. {} attachments found, usage level: {}%", + KnownBlock.BlockHash, + FoundAttachmentCount, + ReusePercent); + } + ReuseBlockIndexes.push_back(KnownBlockIndex); + + BlockSizes[KnownBlockIndex] = BlockSize; + BlockUseSize[KnownBlockIndex] = ReuseSize; + } + else if (FoundAttachmentCount > 0) + { + // if (IsVerbose) + //{ + // ZEN_OPERATION_LOG_INFO(Output, "Skipping block {}. 
{} attachments found, usage level: {}%", + // KnownBlock.BlockHash, + // FoundAttachmentCount, ReusePercent); + //} + Stats.RejectedBlockCount++; + Stats.RejectedChunkCount += FoundAttachmentCount; + Stats.RejectedByteCount += ReuseSize; + } + } + } + + if (!ReuseBlockIndexes.empty()) + { + std::sort(ReuseBlockIndexes.begin(), ReuseBlockIndexes.end(), [&](size_t Lhs, size_t Rhs) { + return BlockUseSize[Lhs] > BlockUseSize[Rhs]; + }); + + for (size_t KnownBlockIndex : ReuseBlockIndexes) + { + std::vector<uint32_t> FoundChunkIndexes; + size_t BlockSize = 0; + size_t AdjustedReuseSize = 0; + size_t AdjustedRawReuseSize = 0; + const ChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; + for (size_t BlockChunkIndex = 0; BlockChunkIndex < KnownBlock.ChunkRawHashes.size(); BlockChunkIndex++) + { + const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex]; + const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex]; + BlockSize += BlockChunkSize; + if (auto It = ChunkHashToChunkIndex.find(BlockChunkHash); It != ChunkHashToChunkIndex.end()) + { + const uint32_t ChunkIndex = It->second; + if (!ChunkFound[ChunkIndex]) + { + FoundChunkIndexes.push_back(ChunkIndex); + AdjustedReuseSize += KnownBlock.ChunkCompressedLengths[BlockChunkIndex]; + AdjustedRawReuseSize += KnownBlock.ChunkRawLengths[BlockChunkIndex]; + } + } + } + + size_t ReusePercent = (AdjustedReuseSize * 100) / BlockSize; + + if (ReusePercent >= BlockReuseMinPercentLimit) + { + if (IsVerbose) + { + ZEN_OPERATION_LOG_INFO(Output, + "Reusing block {}. 
{} attachments found, usage level: {}%", + KnownBlock.BlockHash, + FoundChunkIndexes.size(), + ReusePercent); + } + FilteredReuseBlockIndexes.push_back(KnownBlockIndex); + + for (uint32_t ChunkIndex : FoundChunkIndexes) + { + ChunkFound[ChunkIndex] = true; + } + Stats.AcceptedChunkCount += FoundChunkIndexes.size(); + Stats.AcceptedByteCount += AdjustedReuseSize; + Stats.AcceptedRawByteCount += AdjustedRawReuseSize; + Stats.AcceptedReduntantChunkCount += KnownBlock.ChunkRawHashes.size() - FoundChunkIndexes.size(); + Stats.AcceptedReduntantByteCount += BlockSize - AdjustedReuseSize; + } + else + { + // if (IsVerbose) + //{ + // ZEN_OPERATION_LOG_INFO(Output, "Skipping block {}. filtered usage level: {}%", KnownBlock.BlockHash, + // ReusePercent); + //} + Stats.RejectedBlockCount++; + Stats.RejectedChunkCount += FoundChunkIndexes.size(); + Stats.RejectedByteCount += AdjustedReuseSize; + } + } + } + } + OutUnusedChunkIndexes.reserve(ChunkIndexes.size() - Stats.AcceptedChunkCount); + for (uint32_t ChunkIndex : ChunkIndexes) + { + if (!ChunkFound[ChunkIndex]) + { + OutUnusedChunkIndexes.push_back(ChunkIndex); + } + } + } + return FilteredReuseBlockIndexes; +} + #if ZEN_WITH_TESTS namespace testutils { @@ -310,6 +501,246 @@ TEST_CASE("project.store.block") HeaderSize)); } +TEST_CASE("project.store.reuseblocks") +{ + using namespace std::literals; + using namespace testutils; + + std::vector<std::vector<std::size_t>> BlockAttachmentSizes( + {std::vector<std::size_t>{7633, 6825, 5738, 8031, 7225, 566, 3656, 6006, 24, 3466, 1093, 4269, 2257, 3685, 3489, + 7194, 6151, 5482, 6217, 3511, 6738, 5061, 7537, 2759, 1916, 8210, 2235, 4024, 1582, 5251, + 491, 5464, 4607, 8135, 3767, 4045, 4415, 5007, 8876, 6761, 3359, 8526, 4097, 4855, 8225}, + {17633, 16825, 15738, 18031, 17225, 11566, 13656, 16006, 11124, 13466, 11093, 14269, 12257, 13685, 13489, + 17194, 16151, 15482, 16217, 13511, 16738, 15061, 17537, 12759, 11916, 18210, 12235, 14024, 11582, 15251, + 11491, 15464, 14607, 18135, 
13767, 14045, 14415, 15007, 18876, 16761, 13359, 18526, 14097, 14855, 18225}}); + + std::vector<ChunkBlockDescription> BlockDescriptions; + for (auto& AttachmentSizes : BlockAttachmentSizes) + { + std::vector<std::pair<Oid, CompressedBuffer>> AttachmentsWithId = CreateAttachments(AttachmentSizes); + std::vector<std::pair<IoHash, FetchChunkFunc>> Chunks; + Chunks.reserve(AttachmentSizes.size()); + for (const auto& It : AttachmentsWithId) + { + Chunks.push_back( + std::make_pair(It.second.DecodeRawHash(), [Buffer = It.second](const IoHash&) -> std::pair<uint64_t, CompressedBuffer> { + return {Buffer.DecodeRawSize(), Buffer}; + })); + } + ChunkBlockDescription Block; + CompressedBuffer BlockBuffer = GenerateChunkBlock(std::move(Chunks), Block); + BlockDescriptions.emplace_back(std::move(Block)); + } + + LoggerRef LogRef = Log(); + std::unique_ptr<OperationLogOutput> LogOutput(CreateStandardLogOutput(LogRef)); + + { + // We use just about all the chunks - should result in use of both blocks + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<IoHash> ManyChunkHashes; + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[0].ChunkRawHashes.begin(), + BlockDescriptions[0].ChunkRawHashes.end() - 1); + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[1].ChunkRawHashes.begin() + 1, + BlockDescriptions[1].ChunkRawHashes.end()); + std::vector<uint32_t> ManyChunkIndexes; + ManyChunkIndexes.resize(ManyChunkHashes.size()); + std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0); + std::vector<uint32_t> UnusedChunkIndexes; + + std::vector<size_t> ReusedBlocks = FindReuseBlocks(*LogOutput, + 80, + false, + ReuseBlocksStats, + BlockDescriptions, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes); + + CHECK_EQ(2u, ReusedBlocks.size()); + CHECK_EQ(0u, UnusedChunkIndexes.size()); + } + + { + // We now only about one of the blocks + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<IoHash> ManyChunkHashes; + 
ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[0].ChunkRawHashes.begin(), + BlockDescriptions[0].ChunkRawHashes.end() - 1); + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[1].ChunkRawHashes.begin() + 1, + BlockDescriptions[1].ChunkRawHashes.end()); + std::vector<uint32_t> ManyChunkIndexes; + ManyChunkIndexes.resize(ManyChunkHashes.size()); + std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0); + std::vector<uint32_t> UnusedChunkIndexes; + + std::vector<size_t> ReusedBlocks = FindReuseBlocks(*LogOutput, + 80, + false, + ReuseBlocksStats, + std::vector<ChunkBlockDescription>{BlockDescriptions[0]}, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes); + + CHECK_EQ(1u, ReusedBlocks.size()); + CHECK_EQ(BlockDescriptions[1].ChunkRawHashes.size() - 1, UnusedChunkIndexes.size()); + } + + { + std::vector<IoHash> ManyChunkHashes; + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[0].ChunkRawHashes.begin(), + BlockDescriptions[0].ChunkRawHashes.end() - BlockDescriptions[0].ChunkRawHashes.size() / 2); + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[1].ChunkRawHashes.begin() + BlockDescriptions[1].ChunkRawHashes.size() / 2, + BlockDescriptions[1].ChunkRawHashes.end()); + std::vector<uint32_t> ManyChunkIndexes; + ManyChunkIndexes.resize(ManyChunkHashes.size()); + std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0); + + { + // We use half the chunks - should result in no use of blocks due to 80% limit + std::vector<uint32_t> UnusedChunkIndexes80Percent; + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<size_t> ReusedBlocks80Percent = FindReuseBlocks(*LogOutput, + 80, + false, + ReuseBlocksStats, + BlockDescriptions, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes80Percent); + + CHECK_EQ(0u, ReusedBlocks80Percent.size()); + CHECK_EQ(ManyChunkHashes.size(), UnusedChunkIndexes80Percent.size()); + } + { + // We use half the chunks - should result in use of both 
blocks due to 40% limit + std::vector<uint32_t> UnusedChunkIndexes40Percent; + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<size_t> ReusedBlocks40Percent = FindReuseBlocks(*LogOutput, + 40, + false, + ReuseBlocksStats, + BlockDescriptions, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes40Percent); + + CHECK_EQ(2u, ReusedBlocks40Percent.size()); + CHECK_EQ(0u, UnusedChunkIndexes40Percent.size()); + } + } + + { + std::vector<IoHash> ManyChunkHashes; + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[0].ChunkRawHashes.begin(), + BlockDescriptions[0].ChunkRawHashes.end() - BlockDescriptions[0].ChunkRawHashes.size() / 2); + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[1].ChunkRawHashes.begin() + 1, + BlockDescriptions[1].ChunkRawHashes.end()); + std::vector<uint32_t> ManyChunkIndexes; + ManyChunkIndexes.resize(ManyChunkHashes.size()); + std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0); + + { + // We use half the chunks for first block - should result in use of one blocks due to 80% limit + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<uint32_t> UnusedChunkIndexes80Percent; + std::vector<size_t> ReusedBlocks80Percent = FindReuseBlocks(*LogOutput, + 80, + false, + ReuseBlocksStats, + BlockDescriptions, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes80Percent); + + CHECK_EQ(1u, ReusedBlocks80Percent.size()); + CHECK_EQ(BlockDescriptions[0].ChunkRawHashes.size() - BlockDescriptions[0].ChunkRawHashes.size() / 2, + UnusedChunkIndexes80Percent.size()); + } + { + // We use half the chunks - should result in use of both blocks due to 40% limit + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<uint32_t> UnusedChunkIndexes40Percent; + std::vector<size_t> ReusedBlocks40Percent = FindReuseBlocks(*LogOutput, + 40, + false, + ReuseBlocksStats, + BlockDescriptions, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes40Percent); + + CHECK_EQ(2u, ReusedBlocks40Percent.size()); + 
CHECK_EQ(0u, UnusedChunkIndexes40Percent.size()); + } + } + + { + // Test simulate ThinkChunkBlockDescriptions + + for (ChunkBlockDescription& BlockDescription : BlockDescriptions) + { + BlockDescription.HeaderSize = 0; + BlockDescription.ChunkRawLengths = std::vector<uint32_t>(BlockDescription.ChunkRawHashes.size(), 1); + BlockDescription.ChunkCompressedLengths = std::vector<uint32_t>(BlockDescription.ChunkRawHashes.size(), 1); + } + + std::vector<IoHash> ManyChunkHashes; + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[0].ChunkRawHashes.begin(), + BlockDescriptions[0].ChunkRawHashes.end() - BlockDescriptions[0].ChunkRawHashes.size() / 2); + ManyChunkHashes.insert(ManyChunkHashes.end(), + BlockDescriptions[1].ChunkRawHashes.begin() + 1, + BlockDescriptions[1].ChunkRawHashes.end()); + std::vector<uint32_t> ManyChunkIndexes; + ManyChunkIndexes.resize(ManyChunkHashes.size()); + std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0); + + { + // We use half the chunks for first block - should result in use of one blocks due to 80% limit + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<uint32_t> UnusedChunkIndexes80Percent; + std::vector<size_t> ReusedBlocks80Percent = FindReuseBlocks(*LogOutput, + 80, + false, + ReuseBlocksStats, + BlockDescriptions, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes80Percent); + + CHECK_EQ(1u, ReusedBlocks80Percent.size()); + CHECK_EQ(BlockDescriptions[0].ChunkRawHashes.size() - BlockDescriptions[0].ChunkRawHashes.size() / 2, + UnusedChunkIndexes80Percent.size()); + } + { + // We use half the chunks - should result in use of both blocks due to 40% limit + ReuseBlocksStatistics ReuseBlocksStats; + std::vector<uint32_t> UnusedChunkIndexes40Percent; + std::vector<size_t> ReusedBlocks40Percent = FindReuseBlocks(*LogOutput, + 40, + false, + ReuseBlocksStats, + BlockDescriptions, + ManyChunkHashes, + ManyChunkIndexes, + UnusedChunkIndexes40Percent); + + CHECK_EQ(2u, ReusedBlocks40Percent.size()); + 
CHECK_EQ(0u, UnusedChunkIndexes40Percent.size()); + } + } +} + void chunkblock_forcelink() { diff --git a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h index 3c4535d9c..223c668cd 100644 --- a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h +++ b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h @@ -406,24 +406,16 @@ private: struct FindBlocksStatistics { - uint64_t FindBlockTimeMS = 0; - uint64_t PotentialChunkCount = 0; - uint64_t PotentialChunkByteCount = 0; - uint64_t FoundBlockCount = 0; - uint64_t FoundBlockChunkCount = 0; - uint64_t FoundBlockByteCount = 0; - uint64_t AcceptedBlockCount = 0; - uint64_t AcceptedChunkCount = 0; - uint64_t AcceptedByteCount = 0; - uint64_t AcceptedRawByteCount = 0; - uint64_t RejectedBlockCount = 0; - uint64_t RejectedChunkCount = 0; - uint64_t RejectedByteCount = 0; - uint64_t AcceptedReduntantChunkCount = 0; - uint64_t AcceptedReduntantByteCount = 0; - uint64_t NewBlocksCount = 0; - uint64_t NewBlocksChunkCount = 0; - uint64_t NewBlocksChunkByteCount = 0; + uint64_t FindBlockTimeMS = 0; + uint64_t PotentialChunkCount = 0; + uint64_t PotentialChunkByteCount = 0; + uint64_t FoundBlockCount = 0; + uint64_t FoundBlockChunkCount = 0; + uint64_t FoundBlockByteCount = 0; + uint64_t AcceptedBlockCount = 0; + uint64_t NewBlocksCount = 0; + uint64_t NewBlocksChunkCount = 0; + uint64_t NewBlocksChunkByteCount = 0; }; struct UploadStatistics @@ -541,6 +533,7 @@ public: GetFolderContentStatistics m_LocalFolderScanStats; ChunkingStatistics m_ChunkingStats; FindBlocksStatistics m_FindBlocksStats; + ReuseBlocksStatistics m_ReuseBlocksStats; UploadStatistics m_UploadStats; GenerateBlocksStatistics m_GenerateBlocksStats; LooseChunksStatistics m_LooseChunksStats; @@ -551,14 +544,10 @@ private: bool IsAcceptedFolder(const std::string_view& RelativePath) const; bool IsAcceptedFile(const 
std::string_view& RelativePath) const; - std::vector<size_t> FindReuseBlocks(const std::vector<ChunkBlockDescription>& KnownBlocks, - std::span<const IoHash> ChunkHashes, - std::span<const uint32_t> ChunkIndexes, - std::vector<uint32_t>& OutUnusedChunkIndexes); - void ArrangeChunksIntoBlocks(const ChunkedFolderContent& Content, - const ChunkedContentLookup& Lookup, - std::vector<uint32_t>& ChunkIndexes, - std::vector<std::vector<uint32_t>>& OutBlocks); + void ArrangeChunksIntoBlocks(const ChunkedFolderContent& Content, + const ChunkedContentLookup& Lookup, + std::vector<uint32_t>& ChunkIndexes, + std::vector<std::vector<uint32_t>>& OutBlocks); struct GeneratedBlocks { std::vector<ChunkBlockDescription> BlockDescriptions; diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h b/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h index b0d8ef24c..295d275d1 100644 --- a/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h +++ b/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h @@ -37,6 +37,29 @@ bool IterateChunkBlock(const SharedBuffer& BlockPayload, uint64_t& OutHeaderSize); std::vector<uint32_t> ReadChunkBlockHeader(const MemoryView BlockView, uint64_t& OutHeaderSize); +struct ReuseBlocksStatistics +{ + uint64_t AcceptedChunkCount = 0; + uint64_t AcceptedByteCount = 0; + uint64_t AcceptedRawByteCount = 0; + uint64_t RejectedBlockCount = 0; + uint64_t RejectedChunkCount = 0; + uint64_t RejectedByteCount = 0; + uint64_t AcceptedReduntantChunkCount = 0; + uint64_t AcceptedReduntantByteCount = 0; +}; + +class OperationLogOutput; + +std::vector<size_t> FindReuseBlocks(OperationLogOutput& Output, + const uint8_t BlockReuseMinPercentLimit, + const bool IsVerbose, + ReuseBlocksStatistics& Stats, + const std::vector<ChunkBlockDescription>& KnownBlocks, + std::span<const IoHash> ChunkHashes, + std::span<const uint32_t> ChunkIndexes, + std::vector<uint32_t>& OutUnusedChunkIndexes); + void chunkblock_forcelink(); } // 
namespace zen diff --git a/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h b/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h index 182b64609..008f94351 100644 --- a/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h +++ b/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h @@ -70,7 +70,7 @@ public: struct GetKnownBlocksResult : public Result { - std::vector<ThinChunkBlockDescription> Blocks; + std::vector<ChunkBlockDescription> Blocks; }; struct RemoteStoreInfo diff --git a/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp b/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp index 706f11e8c..bd793b745 100644 --- a/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp +++ b/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp @@ -436,12 +436,7 @@ public: { CbObject KnownBlocks = m_BuildStorage->FindBlocks(m_BuildId, 10000u); std::optional<std::vector<ChunkBlockDescription>> Blocks = ParseChunkBlockDescriptionList(KnownBlocks); - Result.Blocks.reserve(Blocks.value().size()); - for (ChunkBlockDescription& BlockDescription : Blocks.value()) - { - Result.Blocks.push_back(ThinChunkBlockDescription{.BlockHash = BlockDescription.BlockHash, - .ChunkRawHashes = std::move(BlockDescription.ChunkRawHashes)}); - } + Result.Blocks = std::move(Blocks.value()); } catch (const HttpClientError& Ex) { diff --git a/src/zenremotestore/projectstore/fileremoteprojectstore.cpp b/src/zenremotestore/projectstore/fileremoteprojectstore.cpp index 50be5d2d9..3a67d3842 100644 --- a/src/zenremotestore/projectstore/fileremoteprojectstore.cpp +++ b/src/zenremotestore/projectstore/fileremoteprojectstore.cpp @@ -202,9 +202,18 @@ public: return GetKnownBlocksResult{{.ErrorCode = static_cast<int>(HttpResponseCode::NoContent), .ElapsedSeconds = LoadResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000}}; } - std::vector<ThinChunkBlockDescription> KnownBlocks = 
GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes); - GetKnownBlocksResult Result{{.ElapsedSeconds = LoadResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000}}; - Result.Blocks = std::move(KnownBlocks); + std::vector<ThinChunkBlockDescription> ThinKnownBlocks = GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes); + + const size_t KnowBlockCount = ThinKnownBlocks.size(); + + GetKnownBlocksResult Result{{.ElapsedSeconds = LoadResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000}}; + Result.Blocks.resize(KnowBlockCount); + for (size_t BlockIndex = 0; BlockIndex < KnowBlockCount; BlockIndex++) + { + Result.Blocks[BlockIndex].BlockHash = ThinKnownBlocks[BlockIndex].BlockHash; + Result.Blocks[BlockIndex].ChunkRawHashes = std::move(ThinKnownBlocks[BlockIndex].ChunkRawHashes); + } + return Result; } diff --git a/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp b/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp index e26a5e88d..6d888ea01 100644 --- a/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp +++ b/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp @@ -197,11 +197,18 @@ public: return GetKnownBlocksResult{{.ErrorCode = static_cast<int>(HttpResponseCode::NoContent), .ElapsedSeconds = LoadResult.ElapsedSeconds + ExistsResult.ElapsedSeconds}}; } - std::vector<ThinChunkBlockDescription> KnownBlocks = GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes); + std::vector<ThinChunkBlockDescription> ThinKnownBlocks = GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes); GetKnownBlocksResult Result{ {.ElapsedSeconds = LoadResult.ElapsedSeconds + ExistsResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000.0}}; - Result.Blocks = std::move(KnownBlocks); + const size_t KnowBlockCount = ThinKnownBlocks.size(); + + Result.Blocks.resize(KnowBlockCount); + for (size_t BlockIndex = 0; BlockIndex < KnowBlockCount; BlockIndex++) + { + Result.Blocks[BlockIndex].BlockHash 
= ThinKnownBlocks[BlockIndex].BlockHash; + Result.Blocks[BlockIndex].ChunkRawHashes = std::move(ThinKnownBlocks[BlockIndex].ChunkRawHashes); + } return Result; } diff --git a/src/zenremotestore/projectstore/remoteprojectstore.cpp b/src/zenremotestore/projectstore/remoteprojectstore.cpp index 5652d5271..0e18cc6b0 100644 --- a/src/zenremotestore/projectstore/remoteprojectstore.cpp +++ b/src/zenremotestore/projectstore/remoteprojectstore.cpp @@ -14,8 +14,10 @@ #include <zencore/workthreadpool.h> #include <zenhttp/httpcommon.h> #include <zenremotestore/chunking/chunkedfile.h> +#include <zenremotestore/operationlogoutput.h> #include <zenstore/cidstore.h> +#include <numeric> #include <unordered_map> #if ZEN_WITH_TESTS @@ -534,11 +536,16 @@ namespace remotestore_impl { return; } + uint64_t PotentialSize = 0; + uint64_t UsedSize = 0; + uint64_t BlockSize = BlockPayload.GetSize(); + uint64_t BlockHeaderSize = 0; bool StoreChunksOK = IterateChunkBlock( BlockPayload, - [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info](CompressedBuffer&& Chunk, - const IoHash& AttachmentRawHash) { + [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info, &PotentialSize]( + CompressedBuffer&& Chunk, + const IoHash& AttachmentRawHash) { if (WantedChunks.contains(AttachmentRawHash)) { WriteAttachmentBuffers.emplace_back(Chunk.GetCompressed().Flatten().AsIoBuffer()); @@ -552,6 +559,7 @@ namespace remotestore_impl { ZEN_ASSERT(RawHash == AttachmentRawHash); WriteRawHashes.emplace_back(AttachmentRawHash); WantedChunks.erase(AttachmentRawHash); + PotentialSize += WriteAttachmentBuffers.back().GetSize(); } }, BlockHeaderSize); @@ -581,8 +589,16 @@ namespace remotestore_impl { { Info.AttachmentBytesStored.fetch_add(WriteAttachmentBuffers[Index].GetSize()); Info.AttachmentsStored.fetch_add(1); + UsedSize += WriteAttachmentBuffers[Index].GetSize(); } } + ZEN_DEBUG("Used {} (matching {}) out of {} for block {} ({} %) (use of matching {}%)", + NiceBytes(UsedSize), + 
NiceBytes(PotentialSize), + NiceBytes(BlockSize), + BlockHash, + (100 * UsedSize) / BlockSize, + PotentialSize > 0 ? (UsedSize * 100) / PotentialSize : 0); } } catch (const std::exception& Ex) @@ -1182,7 +1198,7 @@ BuildContainer(CidStore& ChunkStore, bool BuildBlocks, bool IgnoreMissingAttachments, bool AllowChunking, - const std::vector<ThinChunkBlockDescription>& KnownBlocks, + const std::vector<ChunkBlockDescription>& KnownBlocks, WorkerThreadPool& WorkerPool, const std::function<void(CompressedBuffer&&, ChunkBlockDescription&&)>& AsyncOnBlock, const std::function<void(const IoHash&, TGetAttachmentBufferFunc&&)>& OnLargeAttachment, @@ -1193,6 +1209,36 @@ BuildContainer(CidStore& ChunkStore, { using namespace std::literals; + class JobContextLogOutput : public OperationLogOutput + { + public: + JobContextLogOutput(JobContext* OptionalContext) : m_OptionalContext(OptionalContext) {} + virtual void EmitLogMessage(int LogLevel, std::string_view Format, fmt::format_args Args) override + { + ZEN_UNUSED(LogLevel); + if (m_OptionalContext) + { + fmt::basic_memory_buffer<char, 250> MessageBuffer; + fmt::vformat_to(fmt::appender(MessageBuffer), Format, Args); + remotestore_impl::ReportMessage(m_OptionalContext, std::string_view(MessageBuffer.data(), MessageBuffer.size())); + } + } + + virtual void SetLogOperationName(std::string_view Name) override { ZEN_UNUSED(Name); } + virtual void SetLogOperationProgress(uint32_t StepIndex, uint32_t StepCount) override { ZEN_UNUSED(StepIndex, StepCount); } + virtual uint32_t GetProgressUpdateDelayMS() override { return 0; } + virtual ProgressBar* CreateProgressBar(std::string_view InSubTask) override + { + ZEN_UNUSED(InSubTask); + return nullptr; + } + + private: + JobContext* m_OptionalContext; + }; + + std::unique_ptr<OperationLogOutput> LogOutput(std::make_unique<JobContextLogOutput>(OptionalContext)); + size_t OpCount = 0; CbObject OplogContainerObject; @@ -1424,56 +1470,6 @@ BuildContainer(CidStore& ChunkStore, return {}; } - 
auto FindReuseBlocks = [](const std::vector<ThinChunkBlockDescription>& KnownBlocks, - const std::unordered_set<IoHash, IoHash::Hasher>& Attachments, - JobContext* OptionalContext) -> std::vector<size_t> { - std::vector<size_t> ReuseBlockIndexes; - if (!Attachments.empty() && !KnownBlocks.empty()) - { - remotestore_impl::ReportMessage( - OptionalContext, - fmt::format("Checking {} Attachments against {} known blocks for reuse", Attachments.size(), KnownBlocks.size())); - Stopwatch ReuseTimer; - - for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++) - { - const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; - size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size(); - if (BlockAttachmentCount == 0) - { - continue; - } - size_t FoundAttachmentCount = 0; - for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) - { - if (Attachments.contains(KnownHash)) - { - FoundAttachmentCount++; - } - } - - size_t ReusePercent = (FoundAttachmentCount * 100) / BlockAttachmentCount; - // TODO: Configure reuse-level - if (ReusePercent > 80) - { - ZEN_DEBUG("Reusing block {}. {} attachments found, usage level: {}%", - KnownBlock.BlockHash, - FoundAttachmentCount, - ReusePercent); - ReuseBlockIndexes.push_back(KnownBlockIndex); - } - else if (FoundAttachmentCount > 0) - { - ZEN_DEBUG("Skipping block {}. 
{} attachments found, usage level: {}%", - KnownBlock.BlockHash, - FoundAttachmentCount, - ReusePercent); - } - } - } - return ReuseBlockIndexes; - }; - std::unordered_set<IoHash, IoHash::Hasher> FoundHashes; FoundHashes.reserve(UploadAttachments.size()); for (const auto& It : UploadAttachments) @@ -1482,15 +1478,35 @@ BuildContainer(CidStore& ChunkStore, } size_t ReusedAttachmentCount = 0; - std::vector<size_t> ReusedBlockIndexes = FindReuseBlocks(KnownBlocks, FoundHashes, OptionalContext); - for (size_t KnownBlockIndex : ReusedBlockIndexes) + std::vector<size_t> ReusedBlockIndexes; { - const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; - for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) + std::vector<IoHash> ChunkHashes; + ChunkHashes.reserve(FoundHashes.size()); + ChunkHashes.insert(ChunkHashes.begin(), FoundHashes.begin(), FoundHashes.end()); + std::vector<uint32_t> ChunkIndexes; + ChunkIndexes.resize(FoundHashes.size()); + std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0); + + std::vector<uint32_t> UnusedChunkIndexes; + ReuseBlocksStatistics ReuseBlocksStats; + + ReusedBlockIndexes = FindReuseBlocks(*LogOutput, + /*BlockReuseMinPercentLimit*/ 80, + /*IsVerbose*/ false, + ReuseBlocksStats, + KnownBlocks, + ChunkHashes, + ChunkIndexes, + UnusedChunkIndexes); + for (size_t KnownBlockIndex : ReusedBlockIndexes) { - if (UploadAttachments.erase(KnownHash) == 1) + const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; + for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) { - ReusedAttachmentCount++; + if (UploadAttachments.erase(KnownHash) == 1) + { + ReusedAttachmentCount++; + } } } } @@ -1823,20 +1839,39 @@ BuildContainer(CidStore& ChunkStore, UploadAttachments.erase(It); } - std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(KnownBlocks, ChunkedHashes, OptionalContext); - for (size_t KnownBlockIndex : ReusedBlockIndexes) { - const ThinChunkBlockDescription& KnownBlock = 
KnownBlocks[KnownBlockIndex]; - for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) + std::vector<IoHash> ChunkHashes; + ChunkHashes.reserve(ChunkedHashes.size()); + ChunkHashes.insert(ChunkHashes.begin(), ChunkedHashes.begin(), ChunkedHashes.end()); + std::vector<uint32_t> ChunkIndexes; + ChunkIndexes.resize(ChunkedHashes.size()); + std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0); + + std::vector<uint32_t> UnusedChunkIndexes; + ReuseBlocksStatistics ReuseBlocksStats; + + std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(*LogOutput, + /*BlockReuseMinPercentLimit*/ 80, + /*IsVerbose*/ false, + ReuseBlocksStats, + KnownBlocks, + ChunkHashes, + ChunkIndexes, + UnusedChunkIndexes); + for (size_t KnownBlockIndex : ReusedBlockIndexes) { - if (ChunkedHashes.erase(KnownHash) == 1) + const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; + for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) { - ReusedAttachmentCount++; + if (ChunkedHashes.erase(KnownHash) == 1) + { + ReusedAttachmentCount++; + } } } + ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end()); } - ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end()); std::sort(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end()); auto UniqueKnownBlocksEnd = std::unique(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end()); size_t ReuseBlockCount = std::distance(ReusedBlockIndexes.begin(), UniqueKnownBlocksEnd); @@ -2448,7 +2483,7 @@ SaveOplog(CidStore& ChunkStore, OnBlock = UploadBlock; } - std::vector<ThinChunkBlockDescription> KnownBlocks; + std::vector<ChunkBlockDescription> KnownBlocks; uint64_t TransferWallTimeMS = 0; @@ -2473,6 +2508,22 @@ SaveOplog(CidStore& ChunkStore, RemoteProjectStore::GetKnownBlocksResult KnownBlocksResult = RemoteStore.GetKnownBlocks(); TransferWallTimeMS += GetKnownBlocksTimer.GetElapsedTimeMs(); + for 
(ChunkBlockDescription& BlockDescription : KnownBlocksResult.Blocks) + { + if (BlockDescription.ChunkRawLengths.empty()) + { + ZEN_ASSERT(BlockDescription.ChunkCompressedLengths.empty()); + + size_t ChunkCount = BlockDescription.ChunkRawLengths.size(); + if (ChunkCount > 0) + { + // Fake sizes, will give usage number of number of chunks used rather than bytes used - better than nothing + BlockDescription.ChunkRawLengths.resize(ChunkCount, 1); + BlockDescription.ChunkCompressedLengths.resize(ChunkCount, 1); + } + } + } + if (KnownBlocksResult.ErrorCode == static_cast<int>(HttpResponseCode::NoContent)) { remotestore_impl::ReportMessage(OptionalContext, diff --git a/src/zenstore/cas.cpp b/src/zenstore/cas.cpp index 49d24c21e..ed017988f 100644 --- a/src/zenstore/cas.cpp +++ b/src/zenstore/cas.cpp @@ -267,17 +267,17 @@ CasImpl::InsertChunk(IoBuffer Chunk, const IoHash& ChunkHash, InsertMode Mode) } static void -GetCompactCasResults(CasContainerStrategy& Strategy, - std::span<IoBuffer> Data, - std::span<IoHash> ChunkHashes, - std::span<size_t> Indexes, - std::vector<CasStore::InsertResult> Results) +GetCompactCasResults(CasContainerStrategy& Strategy, + std::span<IoBuffer> Data, + std::span<IoHash> ChunkHashes, + std::span<size_t> Indexes, + std::vector<CasStore::InsertResult>& OutResults) { const size_t Count = Indexes.size(); if (Count == 1) { const size_t Index = Indexes[0]; - Results[Index] = Strategy.InsertChunk(Data[Index], ChunkHashes[Index]); + OutResults[Index] = Strategy.InsertChunk(Data[Index], ChunkHashes[Index]); return; } std::vector<IoBuffer> Chunks; @@ -290,12 +290,12 @@ GetCompactCasResults(CasContainerStrategy& Strategy, Hashes.push_back(ChunkHashes[Index]); } - Strategy.InsertChunks(Chunks, Hashes); + std::vector<CasStore::InsertResult> Results = Strategy.InsertChunks(Chunks, Hashes); for (size_t Offset = 0; Offset < Count; Offset++) { - size_t Index = Indexes[Offset]; - Results[Index] = Results[Offset]; + size_t Index = Indexes[Offset]; + 
OutResults[Index] = Results[Offset]; } }; |