aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2025-12-15 13:20:21 +0100
committerGitHub Enterprise <[email protected]>2025-12-15 13:20:21 +0100
commita715d3ab7701e6257730a73c62567052d21c9771 (patch)
tree1f6b1de9c7cf11ec1403187d77d74a3b1af52a39
parentshow download source data (#689) (diff)
downloadzen-a715d3ab7701e6257730a73c62567052d21c9771.tar.xz
zen-a715d3ab7701e6257730a73c62567052d21c9771.zip
oplog download size (#690)
- Bugfix: Upload of oplogs could reference multiple blocks for the same chunk causing redundant downloads of blocks - Improvement: Use the improved block reuse selection function from zen builds upload in zen oplog-export to reduce oplog download size
-rw-r--r--CHANGELOG.md2
-rw-r--r--src/zen/cmds/builds_cmd.cpp35
-rw-r--r--src/zenremotestore/builds/buildstorageoperations.cpp207
-rw-r--r--src/zenremotestore/chunking/chunkblock.cpp431
-rw-r--r--src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h41
-rw-r--r--src/zenremotestore/include/zenremotestore/chunking/chunkblock.h23
-rw-r--r--src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h2
-rw-r--r--src/zenremotestore/projectstore/buildsremoteprojectstore.cpp7
-rw-r--r--src/zenremotestore/projectstore/fileremoteprojectstore.cpp15
-rw-r--r--src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp11
-rw-r--r--src/zenremotestore/projectstore/remoteprojectstore.cpp185
-rw-r--r--src/zenstore/cas.cpp18
12 files changed, 657 insertions, 320 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index af0cca1a2..ec8cb4906 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
##
+- Bugfix: Upload of oplogs could reference multiple blocks for the same chunk causing redundant downloads of blocks
- Improvement: At end of `zen builds download` and `zen oplog-import` we now show information over the source of the data - cache, cloud storage, s3/azure, replication
+- Improvement: Use the improved block reuse selection function from `zen builds upload` in `zen oplog-export` to reduce oplog download size
## 5.7.14
- Improvement: asio http server now supports the `--dedicated` option which prevents port remapping
diff --git a/src/zen/cmds/builds_cmd.cpp b/src/zen/cmds/builds_cmd.cpp
index 3b08a9290..6a97ab542 100644
--- a/src/zen/cmds/builds_cmd.cpp
+++ b/src/zen/cmds/builds_cmd.cpp
@@ -520,14 +520,6 @@ namespace {
"\n FoundBlockChunkCount: {}"
"\n FoundBlockByteCount: {}"
"\n AcceptedBlockCount: {}"
- "\n AcceptedChunkCount: {}"
- "\n AcceptedByteCount: {}"
- "\n AcceptedRawByteCount: {}"
- "\n RejectedBlockCount: {}"
- "\n RejectedChunkCount: {}"
- "\n RejectedByteCount: {}"
- "\n AcceptedReduntantChunkCount: {}"
- "\n AcceptedReduntantByteCount: {}"
"\n NewBlocksCount: {}"
"\n NewBlocksChunkCount: {}"
"\n NewBlocksChunkByteCount: {}",
@@ -538,19 +530,30 @@ namespace {
UploadOp.m_FindBlocksStats.FoundBlockChunkCount,
NiceBytes(UploadOp.m_FindBlocksStats.FoundBlockByteCount),
UploadOp.m_FindBlocksStats.AcceptedBlockCount,
- UploadOp.m_FindBlocksStats.AcceptedChunkCount,
- NiceBytes(UploadOp.m_FindBlocksStats.AcceptedByteCount),
- NiceBytes(UploadOp.m_FindBlocksStats.AcceptedRawByteCount),
- UploadOp.m_FindBlocksStats.RejectedBlockCount,
- UploadOp.m_FindBlocksStats.RejectedChunkCount,
- NiceBytes(UploadOp.m_FindBlocksStats.RejectedByteCount),
- UploadOp.m_FindBlocksStats.AcceptedReduntantChunkCount,
- NiceBytes(UploadOp.m_FindBlocksStats.AcceptedReduntantByteCount),
UploadOp.m_FindBlocksStats.NewBlocksCount,
UploadOp.m_FindBlocksStats.NewBlocksChunkCount,
NiceBytes(UploadOp.m_FindBlocksStats.NewBlocksChunkByteCount));
ZEN_CONSOLE_VERBOSE(
+ "Reuse block stats:"
+ "\n AcceptedChunkCount: {}"
+ "\n AcceptedByteCount: {}"
+ "\n AcceptedRawByteCount: {}"
+ "\n RejectedBlockCount: {}"
+ "\n RejectedChunkCount: {}"
+ "\n RejectedByteCount: {}"
+ "\n AcceptedReduntantChunkCount: {}"
+ "\n AcceptedReduntantByteCount: {}",
+ UploadOp.m_ReuseBlocksStats.AcceptedChunkCount,
+ NiceBytes(UploadOp.m_ReuseBlocksStats.AcceptedByteCount),
+ NiceBytes(UploadOp.m_ReuseBlocksStats.AcceptedRawByteCount),
+ UploadOp.m_ReuseBlocksStats.RejectedBlockCount,
+ UploadOp.m_ReuseBlocksStats.RejectedChunkCount,
+ NiceBytes(UploadOp.m_ReuseBlocksStats.RejectedByteCount),
+ UploadOp.m_ReuseBlocksStats.AcceptedReduntantChunkCount,
+ NiceBytes(UploadOp.m_ReuseBlocksStats.AcceptedReduntantByteCount));
+
+ ZEN_CONSOLE_VERBOSE(
"Generate blocks stats:"
"\n GeneratedBlockByteCount: {}"
"\n GeneratedBlockCount: {}"
diff --git a/src/zenremotestore/builds/buildstorageoperations.cpp b/src/zenremotestore/builds/buildstorageoperations.cpp
index b8dd18bb5..6c370f975 100644
--- a/src/zenremotestore/builds/buildstorageoperations.cpp
+++ b/src/zenremotestore/builds/buildstorageoperations.cpp
@@ -4998,7 +4998,11 @@ BuildsOperationUploadFolder::Execute()
}
else
{
- ReuseBlockIndexes = FindReuseBlocks(PrepBuildResult.KnownBlocks,
+ ReuseBlockIndexes = FindReuseBlocks(m_LogOutput,
+ m_Options.BlockReuseMinPercentLimit,
+ m_Options.IsVerbose,
+ m_ReuseBlocksStats,
+ PrepBuildResult.KnownBlocks,
LocalContent.ChunkedContent.ChunkHashes,
BlockChunkIndexes,
NewBlockChunkIndexes);
@@ -5027,13 +5031,13 @@ BuildsOperationUploadFolder::Execute()
const double AcceptedByteCountPercent =
m_FindBlocksStats.PotentialChunkByteCount > 0
- ? (100.0 * m_FindBlocksStats.AcceptedRawByteCount / m_FindBlocksStats.PotentialChunkByteCount)
+ ? (100.0 * m_ReuseBlocksStats.AcceptedRawByteCount / m_FindBlocksStats.PotentialChunkByteCount)
: 0.0;
const double AcceptedReduntantByteCountPercent =
- m_FindBlocksStats.AcceptedByteCount > 0
- ? (100.0 * m_FindBlocksStats.AcceptedReduntantByteCount) /
- (m_FindBlocksStats.AcceptedByteCount + m_FindBlocksStats.AcceptedReduntantByteCount)
+ m_ReuseBlocksStats.AcceptedByteCount > 0
+ ? (100.0 * m_ReuseBlocksStats.AcceptedReduntantByteCount) /
+ (m_ReuseBlocksStats.AcceptedByteCount + m_ReuseBlocksStats.AcceptedReduntantByteCount)
: 0.0;
if (!m_Options.IsQuiet)
{
@@ -5050,18 +5054,18 @@ BuildsOperationUploadFolder::Execute()
NiceBytes(m_FindBlocksStats.FoundBlockByteCount),
NiceTimeSpanMs(m_FindBlocksStats.FindBlockTimeMS),
- m_FindBlocksStats.AcceptedChunkCount,
- NiceBytes(m_FindBlocksStats.AcceptedRawByteCount),
+ m_ReuseBlocksStats.AcceptedChunkCount,
+ NiceBytes(m_ReuseBlocksStats.AcceptedRawByteCount),
m_FindBlocksStats.AcceptedBlockCount,
AcceptedByteCountPercent,
- m_FindBlocksStats.AcceptedReduntantChunkCount,
- NiceBytes(m_FindBlocksStats.AcceptedReduntantByteCount),
+ m_ReuseBlocksStats.AcceptedReduntantChunkCount,
+ NiceBytes(m_ReuseBlocksStats.AcceptedReduntantByteCount),
AcceptedReduntantByteCountPercent,
- m_FindBlocksStats.RejectedChunkCount,
- NiceBytes(m_FindBlocksStats.RejectedByteCount),
- m_FindBlocksStats.RejectedBlockCount,
+ m_ReuseBlocksStats.RejectedChunkCount,
+ NiceBytes(m_ReuseBlocksStats.RejectedByteCount),
+ m_ReuseBlocksStats.RejectedBlockCount,
m_FindBlocksStats.NewBlocksChunkCount,
NiceBytes(m_FindBlocksStats.NewBlocksChunkByteCount),
@@ -5497,7 +5501,7 @@ BuildsOperationUploadFolder::Execute()
{{"totalSize", double(m_LocalFolderScanStats.FoundFileByteCount.load())},
{"reusedRatio", AcceptedByteCountPercent / 100.0},
{"reusedBlockCount", double(m_FindBlocksStats.AcceptedBlockCount)},
- {"reusedBlockByteCount", double(m_FindBlocksStats.AcceptedRawByteCount)},
+ {"reusedBlockByteCount", double(m_ReuseBlocksStats.AcceptedRawByteCount)},
{"newBlockCount", double(m_FindBlocksStats.NewBlocksCount)},
{"newBlockByteCount", double(m_FindBlocksStats.NewBlocksChunkByteCount)},
{"uploadedCount", double(m_UploadStats.BlockCount.load() + m_UploadStats.ChunkCount.load())},
@@ -5589,183 +5593,6 @@ BuildsOperationUploadFolder::IsAcceptedFile(const std::string_view& RelativePath
return true;
}
-std::vector<size_t>
-BuildsOperationUploadFolder::FindReuseBlocks(const std::vector<ChunkBlockDescription>& KnownBlocks,
- std::span<const IoHash> ChunkHashes,
- std::span<const uint32_t> ChunkIndexes,
- std::vector<uint32_t>& OutUnusedChunkIndexes)
-{
- ZEN_TRACE_CPU("FindReuseBlocks");
-
- // Find all blocks with a usage level higher than MinPercentLimit
- // Pick out the blocks with usage higher or equal to MinPercentLimit
- // Sort them with highest size usage - most usage first
- // Make a list of all chunks and mark them as not found
- // For each block, recalculate the block has usage percent based on the chunks marked as not found
- // If the block still reaches MinPercentLimit, keep it and remove the matching chunks from the not found list
- // Repeat for following all remaining block that initially matched MinPercentLimit
-
- std::vector<size_t> FilteredReuseBlockIndexes;
-
- uint32_t ChunkCount = gsl::narrow<uint32_t>(ChunkHashes.size());
- std::vector<bool> ChunkFound(ChunkCount, false);
-
- if (ChunkCount > 0)
- {
- if (!KnownBlocks.empty())
- {
- Stopwatch ReuseTimer;
-
- tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex;
- ChunkHashToChunkIndex.reserve(ChunkIndexes.size());
- for (uint32_t ChunkIndex : ChunkIndexes)
- {
- ChunkHashToChunkIndex.insert_or_assign(ChunkHashes[ChunkIndex], ChunkIndex);
- }
-
- std::vector<size_t> BlockSizes(KnownBlocks.size(), 0);
- std::vector<size_t> BlockUseSize(KnownBlocks.size(), 0);
-
- std::vector<size_t> ReuseBlockIndexes;
-
- for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++)
- {
- const ChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- if (KnownBlock.BlockHash != IoHash::Zero && KnownBlock.ChunkRawHashes.size() == KnownBlock.ChunkCompressedLengths.size())
- {
- size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size();
- if (BlockAttachmentCount == 0)
- {
- continue;
- }
- size_t ReuseSize = 0;
- size_t BlockSize = 0;
- size_t FoundAttachmentCount = 0;
- size_t BlockChunkCount = KnownBlock.ChunkRawHashes.size();
- for (size_t BlockChunkIndex = 0; BlockChunkIndex < BlockChunkCount; BlockChunkIndex++)
- {
- const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex];
- const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex];
- BlockSize += BlockChunkSize;
- if (ChunkHashToChunkIndex.contains(BlockChunkHash))
- {
- ReuseSize += BlockChunkSize;
- FoundAttachmentCount++;
- }
- }
-
- size_t ReusePercent = (ReuseSize * 100) / BlockSize;
-
- if (ReusePercent >= m_Options.BlockReuseMinPercentLimit)
- {
- if (m_Options.IsVerbose)
- {
- ZEN_OPERATION_LOG_INFO(m_LogOutput,
- "Reusing block {}. {} attachments found, usage level: {}%",
- KnownBlock.BlockHash,
- FoundAttachmentCount,
- ReusePercent);
- }
- ReuseBlockIndexes.push_back(KnownBlockIndex);
-
- BlockSizes[KnownBlockIndex] = BlockSize;
- BlockUseSize[KnownBlockIndex] = ReuseSize;
- }
- else if (FoundAttachmentCount > 0)
- {
- // if (m_Options.IsVerbose)
- //{
- // ZEN_OPERATION_LOG_INFO(m_LogOutput, "Skipping block {}. {} attachments found, usage level: {}%",
- // KnownBlock.BlockHash,
- // FoundAttachmentCount, ReusePercent);
- //}
- m_FindBlocksStats.RejectedBlockCount++;
- m_FindBlocksStats.RejectedChunkCount += FoundAttachmentCount;
- m_FindBlocksStats.RejectedByteCount += ReuseSize;
- }
- }
- }
-
- if (!ReuseBlockIndexes.empty())
- {
- std::sort(ReuseBlockIndexes.begin(), ReuseBlockIndexes.end(), [&](size_t Lhs, size_t Rhs) {
- return BlockUseSize[Lhs] > BlockUseSize[Rhs];
- });
-
- for (size_t KnownBlockIndex : ReuseBlockIndexes)
- {
- std::vector<uint32_t> FoundChunkIndexes;
- size_t BlockSize = 0;
- size_t AdjustedReuseSize = 0;
- size_t AdjustedRawReuseSize = 0;
- const ChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- for (size_t BlockChunkIndex = 0; BlockChunkIndex < KnownBlock.ChunkRawHashes.size(); BlockChunkIndex++)
- {
- const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex];
- const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex];
- BlockSize += BlockChunkSize;
- if (auto It = ChunkHashToChunkIndex.find(BlockChunkHash); It != ChunkHashToChunkIndex.end())
- {
- const uint32_t ChunkIndex = It->second;
- if (!ChunkFound[ChunkIndex])
- {
- FoundChunkIndexes.push_back(ChunkIndex);
- AdjustedReuseSize += KnownBlock.ChunkCompressedLengths[BlockChunkIndex];
- AdjustedRawReuseSize += KnownBlock.ChunkRawLengths[BlockChunkIndex];
- }
- }
- }
-
- size_t ReusePercent = (AdjustedReuseSize * 100) / BlockSize;
-
- if (ReusePercent >= m_Options.BlockReuseMinPercentLimit)
- {
- if (m_Options.IsVerbose)
- {
- ZEN_OPERATION_LOG_INFO(m_LogOutput,
- "Reusing block {}. {} attachments found, usage level: {}%",
- KnownBlock.BlockHash,
- FoundChunkIndexes.size(),
- ReusePercent);
- }
- FilteredReuseBlockIndexes.push_back(KnownBlockIndex);
-
- for (uint32_t ChunkIndex : FoundChunkIndexes)
- {
- ChunkFound[ChunkIndex] = true;
- }
- m_FindBlocksStats.AcceptedChunkCount += FoundChunkIndexes.size();
- m_FindBlocksStats.AcceptedByteCount += AdjustedReuseSize;
- m_FindBlocksStats.AcceptedRawByteCount += AdjustedRawReuseSize;
- m_FindBlocksStats.AcceptedReduntantChunkCount += KnownBlock.ChunkRawHashes.size() - FoundChunkIndexes.size();
- m_FindBlocksStats.AcceptedReduntantByteCount += BlockSize - AdjustedReuseSize;
- }
- else
- {
- // if (m_Options.IsVerbose)
- //{
- // ZEN_OPERATION_LOG_INFO(m_LogOutput, "Skipping block {}. filtered usage level: {}%", KnownBlock.BlockHash,
- // ReusePercent);
- //}
- m_FindBlocksStats.RejectedBlockCount++;
- m_FindBlocksStats.RejectedChunkCount += FoundChunkIndexes.size();
- m_FindBlocksStats.RejectedByteCount += AdjustedReuseSize;
- }
- }
- }
- }
- OutUnusedChunkIndexes.reserve(ChunkIndexes.size() - m_FindBlocksStats.AcceptedChunkCount);
- for (uint32_t ChunkIndex : ChunkIndexes)
- {
- if (!ChunkFound[ChunkIndex])
- {
- OutUnusedChunkIndexes.push_back(ChunkIndex);
- }
- }
- }
- return FilteredReuseBlockIndexes;
-}
-
void
BuildsOperationUploadFolder::ArrangeChunksIntoBlocks(const ChunkedFolderContent& Content,
const ChunkedContentLookup& Lookup,
diff --git a/src/zenremotestore/chunking/chunkblock.cpp b/src/zenremotestore/chunking/chunkblock.cpp
index 05ae13de1..a5d0db205 100644
--- a/src/zenremotestore/chunking/chunkblock.cpp
+++ b/src/zenremotestore/chunking/chunkblock.cpp
@@ -5,14 +5,23 @@
#include <zencore/compactbinarybuilder.h>
#include <zencore/fmtutils.h>
#include <zencore/logging.h>
+#include <zencore/timer.h>
+#include <zencore/trace.h>
+
+#include <zenremotestore/operationlogoutput.h>
#include <vector>
+ZEN_THIRD_PARTY_INCLUDES_START
+#include <tsl/robin_map.h>
+ZEN_THIRD_PARTY_INCLUDES_END
+
#if ZEN_WITH_TESTS
# include <zencore/testing.h>
# include <zencore/testutils.h>
# include <unordered_map>
+# include <numeric>
#endif // ZEN_WITH_TESTS
namespace zen {
@@ -261,6 +270,188 @@ IterateChunkBlock(const SharedBuffer& BlockPayload,
return true;
};
+std::vector<size_t>
+FindReuseBlocks(OperationLogOutput& Output,
+ const uint8_t BlockReuseMinPercentLimit,
+ const bool IsVerbose,
+ ReuseBlocksStatistics& Stats,
+ const std::vector<ChunkBlockDescription>& KnownBlocks,
+ std::span<const IoHash> ChunkHashes,
+ std::span<const uint32_t> ChunkIndexes,
+ std::vector<uint32_t>& OutUnusedChunkIndexes)
+{
+ ZEN_TRACE_CPU("FindReuseBlocks");
+
+ // Find all blocks with a usage level higher than MinPercentLimit
+ // Pick out the blocks with usage higher or equal to MinPercentLimit
+ // Sort them with highest size usage - most usage first
+ // Make a list of all chunks and mark them as not found
+ // For each block, recalculate the block's usage percent based on the chunks still marked as not found
+ // If the block still reaches MinPercentLimit, keep it and remove the matching chunks from the not found list
+ // Repeat for all remaining blocks that initially matched MinPercentLimit
+
+ std::vector<size_t> FilteredReuseBlockIndexes;
+
+ uint32_t ChunkCount = gsl::narrow<uint32_t>(ChunkHashes.size());
+ std::vector<bool> ChunkFound(ChunkCount, false);
+
+ if (ChunkCount > 0)
+ {
+ if (!KnownBlocks.empty())
+ {
+ Stopwatch ReuseTimer;
+
+ tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex;
+ ChunkHashToChunkIndex.reserve(ChunkIndexes.size());
+ for (uint32_t ChunkIndex : ChunkIndexes)
+ {
+ ChunkHashToChunkIndex.insert_or_assign(ChunkHashes[ChunkIndex], ChunkIndex);
+ }
+
+ std::vector<size_t> BlockSizes(KnownBlocks.size(), 0);
+ std::vector<size_t> BlockUseSize(KnownBlocks.size(), 0);
+
+ std::vector<size_t> ReuseBlockIndexes;
+
+ for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++)
+ {
+ const ChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
+
+ if (KnownBlock.BlockHash != IoHash::Zero && KnownBlock.ChunkRawHashes.size() == KnownBlock.ChunkCompressedLengths.size())
+ {
+ size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size();
+ if (BlockAttachmentCount == 0)
+ {
+ continue;
+ }
+ size_t ReuseSize = 0;
+ size_t BlockSize = 0;
+ size_t FoundAttachmentCount = 0;
+ size_t BlockChunkCount = KnownBlock.ChunkRawHashes.size();
+ for (size_t BlockChunkIndex = 0; BlockChunkIndex < BlockChunkCount; BlockChunkIndex++)
+ {
+ const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex];
+ const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex];
+ BlockSize += BlockChunkSize;
+ if (ChunkHashToChunkIndex.contains(BlockChunkHash))
+ {
+ ReuseSize += BlockChunkSize;
+ FoundAttachmentCount++;
+ }
+ }
+
+ size_t ReusePercent = (ReuseSize * 100) / BlockSize;
+
+ if (ReusePercent >= BlockReuseMinPercentLimit)
+ {
+ if (IsVerbose)
+ {
+ ZEN_OPERATION_LOG_INFO(Output,
+ "Reusing block {}. {} attachments found, usage level: {}%",
+ KnownBlock.BlockHash,
+ FoundAttachmentCount,
+ ReusePercent);
+ }
+ ReuseBlockIndexes.push_back(KnownBlockIndex);
+
+ BlockSizes[KnownBlockIndex] = BlockSize;
+ BlockUseSize[KnownBlockIndex] = ReuseSize;
+ }
+ else if (FoundAttachmentCount > 0)
+ {
+ // if (IsVerbose)
+ //{
+ // ZEN_OPERATION_LOG_INFO(Output, "Skipping block {}. {} attachments found, usage level: {}%",
+ // KnownBlock.BlockHash,
+ // FoundAttachmentCount, ReusePercent);
+ //}
+ Stats.RejectedBlockCount++;
+ Stats.RejectedChunkCount += FoundAttachmentCount;
+ Stats.RejectedByteCount += ReuseSize;
+ }
+ }
+ }
+
+ if (!ReuseBlockIndexes.empty())
+ {
+ std::sort(ReuseBlockIndexes.begin(), ReuseBlockIndexes.end(), [&](size_t Lhs, size_t Rhs) {
+ return BlockUseSize[Lhs] > BlockUseSize[Rhs];
+ });
+
+ for (size_t KnownBlockIndex : ReuseBlockIndexes)
+ {
+ std::vector<uint32_t> FoundChunkIndexes;
+ size_t BlockSize = 0;
+ size_t AdjustedReuseSize = 0;
+ size_t AdjustedRawReuseSize = 0;
+ const ChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
+ for (size_t BlockChunkIndex = 0; BlockChunkIndex < KnownBlock.ChunkRawHashes.size(); BlockChunkIndex++)
+ {
+ const IoHash& BlockChunkHash = KnownBlock.ChunkRawHashes[BlockChunkIndex];
+ const uint32_t BlockChunkSize = KnownBlock.ChunkCompressedLengths[BlockChunkIndex];
+ BlockSize += BlockChunkSize;
+ if (auto It = ChunkHashToChunkIndex.find(BlockChunkHash); It != ChunkHashToChunkIndex.end())
+ {
+ const uint32_t ChunkIndex = It->second;
+ if (!ChunkFound[ChunkIndex])
+ {
+ FoundChunkIndexes.push_back(ChunkIndex);
+ AdjustedReuseSize += KnownBlock.ChunkCompressedLengths[BlockChunkIndex];
+ AdjustedRawReuseSize += KnownBlock.ChunkRawLengths[BlockChunkIndex];
+ }
+ }
+ }
+
+ size_t ReusePercent = (AdjustedReuseSize * 100) / BlockSize;
+
+ if (ReusePercent >= BlockReuseMinPercentLimit)
+ {
+ if (IsVerbose)
+ {
+ ZEN_OPERATION_LOG_INFO(Output,
+ "Reusing block {}. {} attachments found, usage level: {}%",
+ KnownBlock.BlockHash,
+ FoundChunkIndexes.size(),
+ ReusePercent);
+ }
+ FilteredReuseBlockIndexes.push_back(KnownBlockIndex);
+
+ for (uint32_t ChunkIndex : FoundChunkIndexes)
+ {
+ ChunkFound[ChunkIndex] = true;
+ }
+ Stats.AcceptedChunkCount += FoundChunkIndexes.size();
+ Stats.AcceptedByteCount += AdjustedReuseSize;
+ Stats.AcceptedRawByteCount += AdjustedRawReuseSize;
+ Stats.AcceptedReduntantChunkCount += KnownBlock.ChunkRawHashes.size() - FoundChunkIndexes.size();
+ Stats.AcceptedReduntantByteCount += BlockSize - AdjustedReuseSize;
+ }
+ else
+ {
+ // if (IsVerbose)
+ //{
+ // ZEN_OPERATION_LOG_INFO(Output, "Skipping block {}. filtered usage level: {}%", KnownBlock.BlockHash,
+ // ReusePercent);
+ //}
+ Stats.RejectedBlockCount++;
+ Stats.RejectedChunkCount += FoundChunkIndexes.size();
+ Stats.RejectedByteCount += AdjustedReuseSize;
+ }
+ }
+ }
+ }
+ OutUnusedChunkIndexes.reserve(ChunkIndexes.size() - Stats.AcceptedChunkCount);
+ for (uint32_t ChunkIndex : ChunkIndexes)
+ {
+ if (!ChunkFound[ChunkIndex])
+ {
+ OutUnusedChunkIndexes.push_back(ChunkIndex);
+ }
+ }
+ }
+ return FilteredReuseBlockIndexes;
+}
+
#if ZEN_WITH_TESTS
namespace testutils {
@@ -310,6 +501,246 @@ TEST_CASE("project.store.block")
HeaderSize));
}
+TEST_CASE("project.store.reuseblocks")
+{
+ using namespace std::literals;
+ using namespace testutils;
+
+ std::vector<std::vector<std::size_t>> BlockAttachmentSizes(
+ {std::vector<std::size_t>{7633, 6825, 5738, 8031, 7225, 566, 3656, 6006, 24, 3466, 1093, 4269, 2257, 3685, 3489,
+ 7194, 6151, 5482, 6217, 3511, 6738, 5061, 7537, 2759, 1916, 8210, 2235, 4024, 1582, 5251,
+ 491, 5464, 4607, 8135, 3767, 4045, 4415, 5007, 8876, 6761, 3359, 8526, 4097, 4855, 8225},
+ {17633, 16825, 15738, 18031, 17225, 11566, 13656, 16006, 11124, 13466, 11093, 14269, 12257, 13685, 13489,
+ 17194, 16151, 15482, 16217, 13511, 16738, 15061, 17537, 12759, 11916, 18210, 12235, 14024, 11582, 15251,
+ 11491, 15464, 14607, 18135, 13767, 14045, 14415, 15007, 18876, 16761, 13359, 18526, 14097, 14855, 18225}});
+
+ std::vector<ChunkBlockDescription> BlockDescriptions;
+ for (auto& AttachmentSizes : BlockAttachmentSizes)
+ {
+ std::vector<std::pair<Oid, CompressedBuffer>> AttachmentsWithId = CreateAttachments(AttachmentSizes);
+ std::vector<std::pair<IoHash, FetchChunkFunc>> Chunks;
+ Chunks.reserve(AttachmentSizes.size());
+ for (const auto& It : AttachmentsWithId)
+ {
+ Chunks.push_back(
+ std::make_pair(It.second.DecodeRawHash(), [Buffer = It.second](const IoHash&) -> std::pair<uint64_t, CompressedBuffer> {
+ return {Buffer.DecodeRawSize(), Buffer};
+ }));
+ }
+ ChunkBlockDescription Block;
+ CompressedBuffer BlockBuffer = GenerateChunkBlock(std::move(Chunks), Block);
+ BlockDescriptions.emplace_back(std::move(Block));
+ }
+
+ LoggerRef LogRef = Log();
+ std::unique_ptr<OperationLogOutput> LogOutput(CreateStandardLogOutput(LogRef));
+
+ {
+ // We use just about all the chunks - should result in use of both blocks
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<IoHash> ManyChunkHashes;
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[0].ChunkRawHashes.begin(),
+ BlockDescriptions[0].ChunkRawHashes.end() - 1);
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[1].ChunkRawHashes.begin() + 1,
+ BlockDescriptions[1].ChunkRawHashes.end());
+ std::vector<uint32_t> ManyChunkIndexes;
+ ManyChunkIndexes.resize(ManyChunkHashes.size());
+ std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0);
+ std::vector<uint32_t> UnusedChunkIndexes;
+
+ std::vector<size_t> ReusedBlocks = FindReuseBlocks(*LogOutput,
+ 80,
+ false,
+ ReuseBlocksStats,
+ BlockDescriptions,
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes);
+
+ CHECK_EQ(2u, ReusedBlocks.size());
+ CHECK_EQ(0u, UnusedChunkIndexes.size());
+ }
+
+ {
+ // We now only know about one of the blocks
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<IoHash> ManyChunkHashes;
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[0].ChunkRawHashes.begin(),
+ BlockDescriptions[0].ChunkRawHashes.end() - 1);
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[1].ChunkRawHashes.begin() + 1,
+ BlockDescriptions[1].ChunkRawHashes.end());
+ std::vector<uint32_t> ManyChunkIndexes;
+ ManyChunkIndexes.resize(ManyChunkHashes.size());
+ std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0);
+ std::vector<uint32_t> UnusedChunkIndexes;
+
+ std::vector<size_t> ReusedBlocks = FindReuseBlocks(*LogOutput,
+ 80,
+ false,
+ ReuseBlocksStats,
+ std::vector<ChunkBlockDescription>{BlockDescriptions[0]},
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes);
+
+ CHECK_EQ(1u, ReusedBlocks.size());
+ CHECK_EQ(BlockDescriptions[1].ChunkRawHashes.size() - 1, UnusedChunkIndexes.size());
+ }
+
+ {
+ std::vector<IoHash> ManyChunkHashes;
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[0].ChunkRawHashes.begin(),
+ BlockDescriptions[0].ChunkRawHashes.end() - BlockDescriptions[0].ChunkRawHashes.size() / 2);
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[1].ChunkRawHashes.begin() + BlockDescriptions[1].ChunkRawHashes.size() / 2,
+ BlockDescriptions[1].ChunkRawHashes.end());
+ std::vector<uint32_t> ManyChunkIndexes;
+ ManyChunkIndexes.resize(ManyChunkHashes.size());
+ std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0);
+
+ {
+ // We use half the chunks - should result in no use of blocks due to 80% limit
+ std::vector<uint32_t> UnusedChunkIndexes80Percent;
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<size_t> ReusedBlocks80Percent = FindReuseBlocks(*LogOutput,
+ 80,
+ false,
+ ReuseBlocksStats,
+ BlockDescriptions,
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes80Percent);
+
+ CHECK_EQ(0u, ReusedBlocks80Percent.size());
+ CHECK_EQ(ManyChunkHashes.size(), UnusedChunkIndexes80Percent.size());
+ }
+ {
+ // We use half the chunks - should result in use of both blocks due to 40% limit
+ std::vector<uint32_t> UnusedChunkIndexes40Percent;
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<size_t> ReusedBlocks40Percent = FindReuseBlocks(*LogOutput,
+ 40,
+ false,
+ ReuseBlocksStats,
+ BlockDescriptions,
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes40Percent);
+
+ CHECK_EQ(2u, ReusedBlocks40Percent.size());
+ CHECK_EQ(0u, UnusedChunkIndexes40Percent.size());
+ }
+ }
+
+ {
+ std::vector<IoHash> ManyChunkHashes;
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[0].ChunkRawHashes.begin(),
+ BlockDescriptions[0].ChunkRawHashes.end() - BlockDescriptions[0].ChunkRawHashes.size() / 2);
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[1].ChunkRawHashes.begin() + 1,
+ BlockDescriptions[1].ChunkRawHashes.end());
+ std::vector<uint32_t> ManyChunkIndexes;
+ ManyChunkIndexes.resize(ManyChunkHashes.size());
+ std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0);
+
+ {
+ // We use half the chunks for first block - should result in use of one block due to 80% limit
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<uint32_t> UnusedChunkIndexes80Percent;
+ std::vector<size_t> ReusedBlocks80Percent = FindReuseBlocks(*LogOutput,
+ 80,
+ false,
+ ReuseBlocksStats,
+ BlockDescriptions,
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes80Percent);
+
+ CHECK_EQ(1u, ReusedBlocks80Percent.size());
+ CHECK_EQ(BlockDescriptions[0].ChunkRawHashes.size() - BlockDescriptions[0].ChunkRawHashes.size() / 2,
+ UnusedChunkIndexes80Percent.size());
+ }
+ {
+ // We use half the chunks - should result in use of both blocks due to 40% limit
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<uint32_t> UnusedChunkIndexes40Percent;
+ std::vector<size_t> ReusedBlocks40Percent = FindReuseBlocks(*LogOutput,
+ 40,
+ false,
+ ReuseBlocksStats,
+ BlockDescriptions,
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes40Percent);
+
+ CHECK_EQ(2u, ReusedBlocks40Percent.size());
+ CHECK_EQ(0u, UnusedChunkIndexes40Percent.size());
+ }
+ }
+
+ {
+ // Test simulating ThinChunkBlockDescriptions: header size zeroed and all chunk lengths set to 1
+
+ for (ChunkBlockDescription& BlockDescription : BlockDescriptions)
+ {
+ BlockDescription.HeaderSize = 0;
+ BlockDescription.ChunkRawLengths = std::vector<uint32_t>(BlockDescription.ChunkRawHashes.size(), 1);
+ BlockDescription.ChunkCompressedLengths = std::vector<uint32_t>(BlockDescription.ChunkRawHashes.size(), 1);
+ }
+
+ std::vector<IoHash> ManyChunkHashes;
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[0].ChunkRawHashes.begin(),
+ BlockDescriptions[0].ChunkRawHashes.end() - BlockDescriptions[0].ChunkRawHashes.size() / 2);
+ ManyChunkHashes.insert(ManyChunkHashes.end(),
+ BlockDescriptions[1].ChunkRawHashes.begin() + 1,
+ BlockDescriptions[1].ChunkRawHashes.end());
+ std::vector<uint32_t> ManyChunkIndexes;
+ ManyChunkIndexes.resize(ManyChunkHashes.size());
+ std::iota(ManyChunkIndexes.begin(), ManyChunkIndexes.end(), 0);
+
+ {
+ // We use half the chunks for first block - should result in use of one block due to 80% limit
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<uint32_t> UnusedChunkIndexes80Percent;
+ std::vector<size_t> ReusedBlocks80Percent = FindReuseBlocks(*LogOutput,
+ 80,
+ false,
+ ReuseBlocksStats,
+ BlockDescriptions,
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes80Percent);
+
+ CHECK_EQ(1u, ReusedBlocks80Percent.size());
+ CHECK_EQ(BlockDescriptions[0].ChunkRawHashes.size() - BlockDescriptions[0].ChunkRawHashes.size() / 2,
+ UnusedChunkIndexes80Percent.size());
+ }
+ {
+ // We use half the chunks - should result in use of both blocks due to 40% limit
+ ReuseBlocksStatistics ReuseBlocksStats;
+ std::vector<uint32_t> UnusedChunkIndexes40Percent;
+ std::vector<size_t> ReusedBlocks40Percent = FindReuseBlocks(*LogOutput,
+ 40,
+ false,
+ ReuseBlocksStats,
+ BlockDescriptions,
+ ManyChunkHashes,
+ ManyChunkIndexes,
+ UnusedChunkIndexes40Percent);
+
+ CHECK_EQ(2u, ReusedBlocks40Percent.size());
+ CHECK_EQ(0u, UnusedChunkIndexes40Percent.size());
+ }
+ }
+}
+
void
chunkblock_forcelink()
{
diff --git a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h
index 3c4535d9c..223c668cd 100644
--- a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h
+++ b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h
@@ -406,24 +406,16 @@ private:
struct FindBlocksStatistics
{
- uint64_t FindBlockTimeMS = 0;
- uint64_t PotentialChunkCount = 0;
- uint64_t PotentialChunkByteCount = 0;
- uint64_t FoundBlockCount = 0;
- uint64_t FoundBlockChunkCount = 0;
- uint64_t FoundBlockByteCount = 0;
- uint64_t AcceptedBlockCount = 0;
- uint64_t AcceptedChunkCount = 0;
- uint64_t AcceptedByteCount = 0;
- uint64_t AcceptedRawByteCount = 0;
- uint64_t RejectedBlockCount = 0;
- uint64_t RejectedChunkCount = 0;
- uint64_t RejectedByteCount = 0;
- uint64_t AcceptedReduntantChunkCount = 0;
- uint64_t AcceptedReduntantByteCount = 0;
- uint64_t NewBlocksCount = 0;
- uint64_t NewBlocksChunkCount = 0;
- uint64_t NewBlocksChunkByteCount = 0;
+ uint64_t FindBlockTimeMS = 0;
+ uint64_t PotentialChunkCount = 0;
+ uint64_t PotentialChunkByteCount = 0;
+ uint64_t FoundBlockCount = 0;
+ uint64_t FoundBlockChunkCount = 0;
+ uint64_t FoundBlockByteCount = 0;
+ uint64_t AcceptedBlockCount = 0;
+ uint64_t NewBlocksCount = 0;
+ uint64_t NewBlocksChunkCount = 0;
+ uint64_t NewBlocksChunkByteCount = 0;
};
struct UploadStatistics
@@ -541,6 +533,7 @@ public:
GetFolderContentStatistics m_LocalFolderScanStats;
ChunkingStatistics m_ChunkingStats;
FindBlocksStatistics m_FindBlocksStats;
+ ReuseBlocksStatistics m_ReuseBlocksStats;
UploadStatistics m_UploadStats;
GenerateBlocksStatistics m_GenerateBlocksStats;
LooseChunksStatistics m_LooseChunksStats;
@@ -551,14 +544,10 @@ private:
bool IsAcceptedFolder(const std::string_view& RelativePath) const;
bool IsAcceptedFile(const std::string_view& RelativePath) const;
- std::vector<size_t> FindReuseBlocks(const std::vector<ChunkBlockDescription>& KnownBlocks,
- std::span<const IoHash> ChunkHashes,
- std::span<const uint32_t> ChunkIndexes,
- std::vector<uint32_t>& OutUnusedChunkIndexes);
- void ArrangeChunksIntoBlocks(const ChunkedFolderContent& Content,
- const ChunkedContentLookup& Lookup,
- std::vector<uint32_t>& ChunkIndexes,
- std::vector<std::vector<uint32_t>>& OutBlocks);
+ void ArrangeChunksIntoBlocks(const ChunkedFolderContent& Content,
+ const ChunkedContentLookup& Lookup,
+ std::vector<uint32_t>& ChunkIndexes,
+ std::vector<std::vector<uint32_t>>& OutBlocks);
struct GeneratedBlocks
{
std::vector<ChunkBlockDescription> BlockDescriptions;
diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h b/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h
index b0d8ef24c..295d275d1 100644
--- a/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h
+++ b/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h
@@ -37,6 +37,29 @@ bool IterateChunkBlock(const SharedBuffer& BlockPayload,
uint64_t& OutHeaderSize);
std::vector<uint32_t> ReadChunkBlockHeader(const MemoryView BlockView, uint64_t& OutHeaderSize);
+struct ReuseBlocksStatistics  // Counters handed to FindReuseBlocks() by reference as its `Stats` argument
+{
+ uint64_t AcceptedChunkCount = 0;
+ uint64_t AcceptedByteCount = 0;
+ uint64_t AcceptedRawByteCount = 0;
+ uint64_t RejectedBlockCount = 0;
+ uint64_t RejectedChunkCount = 0;
+ uint64_t RejectedByteCount = 0;
+ uint64_t AcceptedReduntantChunkCount = 0;  // NOTE(review): "Reduntant" is a misspelling of "Redundant"; renaming touches every user of the field
+ uint64_t AcceptedReduntantByteCount = 0;   // NOTE(review): same misspelling as above
+};
+
+class OperationLogOutput;
+
+std::vector<size_t> FindReuseBlocks(OperationLogOutput& Output,  // returned values are used by callers as indexes into KnownBlocks
+ const uint8_t BlockReuseMinPercentLimit,  // oplog export passes 80; NOTE(review): presumably the minimum usage percent for a block to be reused - confirm
+ const bool IsVerbose,
+ ReuseBlocksStatistics& Stats,  // non-const: counters for the caller to report
+ const std::vector<ChunkBlockDescription>& KnownBlocks,
+ std::span<const IoHash> ChunkHashes,
+ std::span<const uint32_t> ChunkIndexes,  // NOTE(review): presumably positions into ChunkHashes - confirm
+ std::vector<uint32_t>& OutUnusedChunkIndexes);  // NOTE(review): presumably the chunk indexes no returned block covers - confirm
+
void chunkblock_forcelink();
} // namespace zen
diff --git a/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h b/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h
index 182b64609..008f94351 100644
--- a/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h
+++ b/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h
@@ -70,7 +70,7 @@ public:
struct GetKnownBlocksResult : public Result
{
- std::vector<ThinChunkBlockDescription> Blocks;
+ std::vector<ChunkBlockDescription> Blocks;
};
struct RemoteStoreInfo
diff --git a/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp b/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp
index 706f11e8c..bd793b745 100644
--- a/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp
+++ b/src/zenremotestore/projectstore/buildsremoteprojectstore.cpp
@@ -436,12 +436,7 @@ public:
{
CbObject KnownBlocks = m_BuildStorage->FindBlocks(m_BuildId, 10000u);
std::optional<std::vector<ChunkBlockDescription>> Blocks = ParseChunkBlockDescriptionList(KnownBlocks);
- Result.Blocks.reserve(Blocks.value().size());
- for (ChunkBlockDescription& BlockDescription : Blocks.value())
- {
- Result.Blocks.push_back(ThinChunkBlockDescription{.BlockHash = BlockDescription.BlockHash,
- .ChunkRawHashes = std::move(BlockDescription.ChunkRawHashes)});
- }
+ Result.Blocks = std::move(Blocks.value());
}
catch (const HttpClientError& Ex)
{
diff --git a/src/zenremotestore/projectstore/fileremoteprojectstore.cpp b/src/zenremotestore/projectstore/fileremoteprojectstore.cpp
index 50be5d2d9..3a67d3842 100644
--- a/src/zenremotestore/projectstore/fileremoteprojectstore.cpp
+++ b/src/zenremotestore/projectstore/fileremoteprojectstore.cpp
@@ -202,9 +202,18 @@ public:
return GetKnownBlocksResult{{.ErrorCode = static_cast<int>(HttpResponseCode::NoContent),
.ElapsedSeconds = LoadResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000}};
}
- std::vector<ThinChunkBlockDescription> KnownBlocks = GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes);
- GetKnownBlocksResult Result{{.ElapsedSeconds = LoadResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000}};
- Result.Blocks = std::move(KnownBlocks);
+ std::vector<ThinChunkBlockDescription> ThinKnownBlocks = GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes);
+
+ const size_t KnownBlockCount = ThinKnownBlocks.size();
+ GetKnownBlocksResult Result{{.ElapsedSeconds = LoadResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000}};
+ Result.Blocks.resize(KnownBlockCount);  // only BlockHash and ChunkRawHashes are filled in below
+ for (size_t Index = 0; Index < KnownBlockCount; ++Index)
+ {
+ ThinChunkBlockDescription& ThinBlock = ThinKnownBlocks[Index];
+ Result.Blocks[Index].BlockHash = ThinBlock.BlockHash;
+ Result.Blocks[Index].ChunkRawHashes = std::move(ThinBlock.ChunkRawHashes);  // hashes are moved out of the thin list, not copied
+ }
+
return Result;
}
diff --git a/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp b/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp
index e26a5e88d..6d888ea01 100644
--- a/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp
+++ b/src/zenremotestore/projectstore/jupiterremoteprojectstore.cpp
@@ -197,11 +197,18 @@ public:
return GetKnownBlocksResult{{.ErrorCode = static_cast<int>(HttpResponseCode::NoContent),
.ElapsedSeconds = LoadResult.ElapsedSeconds + ExistsResult.ElapsedSeconds}};
}
- std::vector<ThinChunkBlockDescription> KnownBlocks = GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes);
+ std::vector<ThinChunkBlockDescription> ThinKnownBlocks = GetBlocksFromOplog(LoadResult.ContainerObject, ExistingBlockHashes);
GetKnownBlocksResult Result{
{.ElapsedSeconds = LoadResult.ElapsedSeconds + ExistsResult.ElapsedSeconds + Timer.GetElapsedTimeUs() * 1000.0}};
- Result.Blocks = std::move(KnownBlocks);
+ const size_t KnownBlockCount = ThinKnownBlocks.size();
+ Result.Blocks.resize(KnownBlockCount);  // only BlockHash and ChunkRawHashes are filled in below
+ for (size_t Index = 0; Index < KnownBlockCount; ++Index)
+ {
+ ThinChunkBlockDescription& ThinBlock = ThinKnownBlocks[Index];
+ Result.Blocks[Index].BlockHash = ThinBlock.BlockHash;
+ Result.Blocks[Index].ChunkRawHashes = std::move(ThinBlock.ChunkRawHashes);  // hashes are moved out of the thin list, not copied
+ }
return Result;
}
diff --git a/src/zenremotestore/projectstore/remoteprojectstore.cpp b/src/zenremotestore/projectstore/remoteprojectstore.cpp
index 5652d5271..0e18cc6b0 100644
--- a/src/zenremotestore/projectstore/remoteprojectstore.cpp
+++ b/src/zenremotestore/projectstore/remoteprojectstore.cpp
@@ -14,8 +14,10 @@
#include <zencore/workthreadpool.h>
#include <zenhttp/httpcommon.h>
#include <zenremotestore/chunking/chunkedfile.h>
+#include <zenremotestore/operationlogoutput.h>
#include <zenstore/cidstore.h>
+#include <numeric>
#include <unordered_map>
#if ZEN_WITH_TESTS
@@ -534,11 +536,16 @@ namespace remotestore_impl {
return;
}
+ uint64_t PotentialSize = 0;
+ uint64_t UsedSize = 0;
+ uint64_t BlockSize = BlockPayload.GetSize();
+
uint64_t BlockHeaderSize = 0;
bool StoreChunksOK = IterateChunkBlock(
BlockPayload,
- [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info](CompressedBuffer&& Chunk,
- const IoHash& AttachmentRawHash) {
+ [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info, &PotentialSize](
+ CompressedBuffer&& Chunk,
+ const IoHash& AttachmentRawHash) {
if (WantedChunks.contains(AttachmentRawHash))
{
WriteAttachmentBuffers.emplace_back(Chunk.GetCompressed().Flatten().AsIoBuffer());
@@ -552,6 +559,7 @@ namespace remotestore_impl {
ZEN_ASSERT(RawHash == AttachmentRawHash);
WriteRawHashes.emplace_back(AttachmentRawHash);
WantedChunks.erase(AttachmentRawHash);
+ PotentialSize += WriteAttachmentBuffers.back().GetSize();
}
},
BlockHeaderSize);
@@ -581,8 +589,16 @@ namespace remotestore_impl {
{
Info.AttachmentBytesStored.fetch_add(WriteAttachmentBuffers[Index].GetSize());
Info.AttachmentsStored.fetch_add(1);
+ UsedSize += WriteAttachmentBuffers[Index].GetSize();
}
}
+ ZEN_DEBUG("Used {} (matching {}) out of {} for block {} ({} %) (use of matching {}%)",
+ NiceBytes(UsedSize),
+ NiceBytes(PotentialSize),
+ NiceBytes(BlockSize),
+ BlockHash,
+ (100 * UsedSize) / BlockSize,
+ PotentialSize > 0 ? (UsedSize * 100) / PotentialSize : 0);
}
}
catch (const std::exception& Ex)
@@ -1182,7 +1198,7 @@ BuildContainer(CidStore& ChunkStore,
bool BuildBlocks,
bool IgnoreMissingAttachments,
bool AllowChunking,
- const std::vector<ThinChunkBlockDescription>& KnownBlocks,
+ const std::vector<ChunkBlockDescription>& KnownBlocks,
WorkerThreadPool& WorkerPool,
const std::function<void(CompressedBuffer&&, ChunkBlockDescription&&)>& AsyncOnBlock,
const std::function<void(const IoHash&, TGetAttachmentBufferFunc&&)>& OnLargeAttachment,
@@ -1193,6 +1209,36 @@ BuildContainer(CidStore& ChunkStore,
{
using namespace std::literals;
+ class JobContextLogOutput final : public OperationLogOutput  // adapter: routes OperationLogOutput messages to the optional JobContext
+ {
+ public:
+ explicit JobContextLogOutput(JobContext* OptionalContext) : m_OptionalContext(OptionalContext) {}  // OptionalContext may be null
+ virtual void EmitLogMessage(int LogLevel, std::string_view Format, fmt::format_args Args) override
+ {
+ ZEN_UNUSED(LogLevel);  // all levels are reported the same way
+ if (m_OptionalContext)  // NOTE(review): without a context the message is dropped; confirm ReportMessage(nullptr, ...) should not be used instead
+ {
+ fmt::basic_memory_buffer<char, 250> MessageBuffer;  // inline capacity of 250 chars
+ fmt::vformat_to(fmt::appender(MessageBuffer), Format, Args);
+ remotestore_impl::ReportMessage(m_OptionalContext, std::string_view(MessageBuffer.data(), MessageBuffer.size()));
+ }
+ }
+
+ virtual void SetLogOperationName(std::string_view Name) override { ZEN_UNUSED(Name); }  // ignored
+ virtual void SetLogOperationProgress(uint32_t StepIndex, uint32_t StepCount) override { ZEN_UNUSED(StepIndex, StepCount); }  // ignored
+ virtual uint32_t GetProgressUpdateDelayMS() override { return 0; }
+ virtual ProgressBar* CreateProgressBar(std::string_view InSubTask) override  // no progress bar is provided
+ {
+ ZEN_UNUSED(InSubTask);
+ return nullptr;
+ }
+
+ private:
+ JobContext* m_OptionalContext;  // stored as a raw pointer; null when the caller passed no context
+ };
+
+ std::unique_ptr<OperationLogOutput> LogOutput(std::make_unique<JobContextLogOutput>(OptionalContext));
+
size_t OpCount = 0;
CbObject OplogContainerObject;
@@ -1424,56 +1470,6 @@ BuildContainer(CidStore& ChunkStore,
return {};
}
- auto FindReuseBlocks = [](const std::vector<ThinChunkBlockDescription>& KnownBlocks,
- const std::unordered_set<IoHash, IoHash::Hasher>& Attachments,
- JobContext* OptionalContext) -> std::vector<size_t> {
- std::vector<size_t> ReuseBlockIndexes;
- if (!Attachments.empty() && !KnownBlocks.empty())
- {
- remotestore_impl::ReportMessage(
- OptionalContext,
- fmt::format("Checking {} Attachments against {} known blocks for reuse", Attachments.size(), KnownBlocks.size()));
- Stopwatch ReuseTimer;
-
- for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++)
- {
- const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size();
- if (BlockAttachmentCount == 0)
- {
- continue;
- }
- size_t FoundAttachmentCount = 0;
- for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
- {
- if (Attachments.contains(KnownHash))
- {
- FoundAttachmentCount++;
- }
- }
-
- size_t ReusePercent = (FoundAttachmentCount * 100) / BlockAttachmentCount;
- // TODO: Configure reuse-level
- if (ReusePercent > 80)
- {
- ZEN_DEBUG("Reusing block {}. {} attachments found, usage level: {}%",
- KnownBlock.BlockHash,
- FoundAttachmentCount,
- ReusePercent);
- ReuseBlockIndexes.push_back(KnownBlockIndex);
- }
- else if (FoundAttachmentCount > 0)
- {
- ZEN_DEBUG("Skipping block {}. {} attachments found, usage level: {}%",
- KnownBlock.BlockHash,
- FoundAttachmentCount,
- ReusePercent);
- }
- }
- }
- return ReuseBlockIndexes;
- };
-
std::unordered_set<IoHash, IoHash::Hasher> FoundHashes;
FoundHashes.reserve(UploadAttachments.size());
for (const auto& It : UploadAttachments)
@@ -1482,15 +1478,35 @@ BuildContainer(CidStore& ChunkStore,
}
size_t ReusedAttachmentCount = 0;
- std::vector<size_t> ReusedBlockIndexes = FindReuseBlocks(KnownBlocks, FoundHashes, OptionalContext);
- for (size_t KnownBlockIndex : ReusedBlockIndexes)
+ std::vector<size_t> ReusedBlockIndexes;
{
- const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
+ std::vector<IoHash> ChunkHashes;
+ ChunkHashes.reserve(FoundHashes.size());
+ ChunkHashes.insert(ChunkHashes.begin(), FoundHashes.begin(), FoundHashes.end());
+ std::vector<uint32_t> ChunkIndexes;
+ ChunkIndexes.resize(FoundHashes.size());
+ std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0);
+
+ std::vector<uint32_t> UnusedChunkIndexes;
+ ReuseBlocksStatistics ReuseBlocksStats;
+
+ ReusedBlockIndexes = FindReuseBlocks(*LogOutput,
+ /*BlockReuseMinPercentLimit*/ 80,
+ /*IsVerbose*/ false,
+ ReuseBlocksStats,
+ KnownBlocks,
+ ChunkHashes,
+ ChunkIndexes,
+ UnusedChunkIndexes);
+ for (size_t KnownBlockIndex : ReusedBlockIndexes)
{
- if (UploadAttachments.erase(KnownHash) == 1)
+ const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
+ for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
{
- ReusedAttachmentCount++;
+ if (UploadAttachments.erase(KnownHash) == 1)
+ {
+ ReusedAttachmentCount++;
+ }
}
}
}
@@ -1823,20 +1839,39 @@ BuildContainer(CidStore& ChunkStore,
UploadAttachments.erase(It);
}
- std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(KnownBlocks, ChunkedHashes, OptionalContext);
- for (size_t KnownBlockIndex : ReusedBlockIndexes)
{
- const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
+ std::vector<IoHash> ChunkHashes;
+ ChunkHashes.reserve(ChunkedHashes.size());
+ ChunkHashes.insert(ChunkHashes.begin(), ChunkedHashes.begin(), ChunkedHashes.end());
+ std::vector<uint32_t> ChunkIndexes;
+ ChunkIndexes.resize(ChunkedHashes.size());
+ std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0);
+
+ std::vector<uint32_t> UnusedChunkIndexes;
+ ReuseBlocksStatistics ReuseBlocksStats;
+
+ std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(*LogOutput,
+ /*BlockReuseMinPercentLimit*/ 80,
+ /*IsVerbose*/ false,
+ ReuseBlocksStats,
+ KnownBlocks,
+ ChunkHashes,
+ ChunkIndexes,
+ UnusedChunkIndexes);
+ ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end());  // merge first so the loop below also covers the blocks selected for chunked data
+ for (size_t KnownBlockIndex : ReusedBlockIndexes)  // may hold duplicates here; erase() returns 0 on a repeat, so nothing is counted twice
{
- if (ChunkedHashes.erase(KnownHash) == 1)
+ const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
+ for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
{
- ReusedAttachmentCount++;
+ if (ChunkedHashes.erase(KnownHash) == 1)  // this chunk is covered by a reused block, so it is dropped from the chunked set
+ {
+ ReusedAttachmentCount++;
+ }
}
}
}
- ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end());
std::sort(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end());
auto UniqueKnownBlocksEnd = std::unique(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end());
size_t ReuseBlockCount = std::distance(ReusedBlockIndexes.begin(), UniqueKnownBlocksEnd);
@@ -2448,7 +2483,7 @@ SaveOplog(CidStore& ChunkStore,
OnBlock = UploadBlock;
}
- std::vector<ThinChunkBlockDescription> KnownBlocks;
+ std::vector<ChunkBlockDescription> KnownBlocks;
uint64_t TransferWallTimeMS = 0;
@@ -2473,6 +2508,22 @@ SaveOplog(CidStore& ChunkStore,
RemoteProjectStore::GetKnownBlocksResult KnownBlocksResult = RemoteStore.GetKnownBlocks();
TransferWallTimeMS += GetKnownBlocksTimer.GetElapsedTimeMs();
+ for (ChunkBlockDescription& BlockDescription : KnownBlocksResult.Blocks)
+ {
+ if (BlockDescription.ChunkRawLengths.empty())
+ {
+ ZEN_ASSERT(BlockDescription.ChunkCompressedLengths.empty());
+ // The length vectors are empty here, so the chunk count has to come from the hash list
+ const size_t ChunkCount = BlockDescription.ChunkRawHashes.size();
+ if (ChunkCount > 0)
+ {
+ // Fake sizes of 1 per chunk: usage is then measured in number of chunks used rather than bytes used - better than nothing
+ BlockDescription.ChunkRawLengths.resize(ChunkCount, 1);
+ BlockDescription.ChunkCompressedLengths.resize(ChunkCount, 1);
+ }
+ }
+ }
+
if (KnownBlocksResult.ErrorCode == static_cast<int>(HttpResponseCode::NoContent))
{
remotestore_impl::ReportMessage(OptionalContext,
diff --git a/src/zenstore/cas.cpp b/src/zenstore/cas.cpp
index 49d24c21e..ed017988f 100644
--- a/src/zenstore/cas.cpp
+++ b/src/zenstore/cas.cpp
@@ -267,17 +267,17 @@ CasImpl::InsertChunk(IoBuffer Chunk, const IoHash& ChunkHash, InsertMode Mode)
}
static void
-GetCompactCasResults(CasContainerStrategy& Strategy,
- std::span<IoBuffer> Data,
- std::span<IoHash> ChunkHashes,
- std::span<size_t> Indexes,
- std::vector<CasStore::InsertResult> Results)
+GetCompactCasResults(CasContainerStrategy& Strategy,
+ std::span<IoBuffer> Data,
+ std::span<IoHash> ChunkHashes,
+ std::span<size_t> Indexes,
+ std::vector<CasStore::InsertResult>& OutResults)
{
const size_t Count = Indexes.size();
if (Count == 1)
{
const size_t Index = Indexes[0];
- Results[Index] = Strategy.InsertChunk(Data[Index], ChunkHashes[Index]);
+ OutResults[Index] = Strategy.InsertChunk(Data[Index], ChunkHashes[Index]);
return;
}
std::vector<IoBuffer> Chunks;
@@ -290,12 +290,12 @@ GetCompactCasResults(CasContainerStrategy& Strategy,
Hashes.push_back(ChunkHashes[Index]);
}
- Strategy.InsertChunks(Chunks, Hashes);
+ std::vector<CasStore::InsertResult> Results = Strategy.InsertChunks(Chunks, Hashes);
for (size_t Offset = 0; Offset < Count; Offset++)
{
- size_t Index = Indexes[Offset];
- Results[Index] = Results[Offset];
+ size_t Index = Indexes[Offset];
+ OutResults[Index] = Results[Offset];
}
};