aboutsummaryrefslogtreecommitdiff
path: root/src/zenremotestore/projectstore/remoteprojectstore.cpp
diff options
context:
space:
mode:
author Dan Engelbrecht <[email protected]> 2025-12-15 13:20:21 +0100
committer GitHub Enterprise <[email protected]> 2025-12-15 13:20:21 +0100
commit a715d3ab7701e6257730a73c62567052d21c9771 (patch)
tree 1f6b1de9c7cf11ec1403187d77d74a3b1af52a39 /src/zenremotestore/projectstore/remoteprojectstore.cpp
parent show download source data (#689) (diff)
download zen-a715d3ab7701e6257730a73c62567052d21c9771.tar.xz
zen-a715d3ab7701e6257730a73c62567052d21c9771.zip
oplog download size (#690)
- Bugfix: Upload of oplogs could reference multiple blocks for the same chunk, causing redundant downloads of blocks
- Improvement: Use the improved block reuse selection function from zen builds upload in zen oplog-export to reduce oplog download size
Diffstat (limited to 'src/zenremotestore/projectstore/remoteprojectstore.cpp')
-rw-r--r-- src/zenremotestore/projectstore/remoteprojectstore.cpp 185
1 files changed, 118 insertions, 67 deletions
diff --git a/src/zenremotestore/projectstore/remoteprojectstore.cpp b/src/zenremotestore/projectstore/remoteprojectstore.cpp
index 5652d5271..0e18cc6b0 100644
--- a/src/zenremotestore/projectstore/remoteprojectstore.cpp
+++ b/src/zenremotestore/projectstore/remoteprojectstore.cpp
@@ -14,8 +14,10 @@
#include <zencore/workthreadpool.h>
#include <zenhttp/httpcommon.h>
#include <zenremotestore/chunking/chunkedfile.h>
+#include <zenremotestore/operationlogoutput.h>
#include <zenstore/cidstore.h>
+#include <numeric>
#include <unordered_map>
#if ZEN_WITH_TESTS
@@ -534,11 +536,16 @@ namespace remotestore_impl {
return;
}
+ uint64_t PotentialSize = 0;
+ uint64_t UsedSize = 0;
+ uint64_t BlockSize = BlockPayload.GetSize();
+
uint64_t BlockHeaderSize = 0;
bool StoreChunksOK = IterateChunkBlock(
BlockPayload,
- [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info](CompressedBuffer&& Chunk,
- const IoHash& AttachmentRawHash) {
+ [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info, &PotentialSize](
+ CompressedBuffer&& Chunk,
+ const IoHash& AttachmentRawHash) {
if (WantedChunks.contains(AttachmentRawHash))
{
WriteAttachmentBuffers.emplace_back(Chunk.GetCompressed().Flatten().AsIoBuffer());
@@ -552,6 +559,7 @@ namespace remotestore_impl {
ZEN_ASSERT(RawHash == AttachmentRawHash);
WriteRawHashes.emplace_back(AttachmentRawHash);
WantedChunks.erase(AttachmentRawHash);
+ PotentialSize += WriteAttachmentBuffers.back().GetSize();
}
},
BlockHeaderSize);
@@ -581,8 +589,16 @@ namespace remotestore_impl {
{
Info.AttachmentBytesStored.fetch_add(WriteAttachmentBuffers[Index].GetSize());
Info.AttachmentsStored.fetch_add(1);
+ UsedSize += WriteAttachmentBuffers[Index].GetSize();
}
}
+ ZEN_DEBUG("Used {} (matching {}) out of {} for block {} ({} %) (use of matching {}%)",
+ NiceBytes(UsedSize),
+ NiceBytes(PotentialSize),
+ NiceBytes(BlockSize),
+ BlockHash,
+ (100 * UsedSize) / BlockSize,
+ PotentialSize > 0 ? (UsedSize * 100) / PotentialSize : 0);
}
}
catch (const std::exception& Ex)
@@ -1182,7 +1198,7 @@ BuildContainer(CidStore& ChunkStore,
bool BuildBlocks,
bool IgnoreMissingAttachments,
bool AllowChunking,
- const std::vector<ThinChunkBlockDescription>& KnownBlocks,
+ const std::vector<ChunkBlockDescription>& KnownBlocks,
WorkerThreadPool& WorkerPool,
const std::function<void(CompressedBuffer&&, ChunkBlockDescription&&)>& AsyncOnBlock,
const std::function<void(const IoHash&, TGetAttachmentBufferFunc&&)>& OnLargeAttachment,
@@ -1193,6 +1209,36 @@ BuildContainer(CidStore& ChunkStore,
{
using namespace std::literals;
+ // Adapter from OperationLogOutput to the optional JobContext of this operation: formatted
+ // log messages are forwarded through remotestore_impl::ReportMessage, while the operation
+ // name, progress and progress-bar callbacks are no-ops.
+ class JobContextLogOutput : public OperationLogOutput
+ {
+ public:
+     // explicit so that a bare JobContext* never converts silently into a log output.
+     // OptionalContext may be null; EmitLogMessage then does nothing.
+     explicit JobContextLogOutput(JobContext* OptionalContext) : m_OptionalContext(OptionalContext) {}
+
+     // Formats Format/Args into a local buffer and reports the resulting text to the job
+     // context. LogLevel is ignored, so messages of every level are reported the same way.
+     virtual void EmitLogMessage(int LogLevel, std::string_view Format, fmt::format_args Args) override
+     {
+         ZEN_UNUSED(LogLevel);
+         if (m_OptionalContext)
+         {
+             fmt::basic_memory_buffer<char, 250> MessageBuffer;
+             fmt::vformat_to(fmt::appender(MessageBuffer), Format, Args);
+             remotestore_impl::ReportMessage(m_OptionalContext, std::string_view(MessageBuffer.data(), MessageBuffer.size()));
+         }
+     }
+
+     // Operation name and step progress are not reported anywhere by this adapter.
+     virtual void SetLogOperationName(std::string_view Name) override { ZEN_UNUSED(Name); }
+     virtual void SetLogOperationProgress(uint32_t StepIndex, uint32_t StepCount) override { ZEN_UNUSED(StepIndex, StepCount); }
+     virtual uint32_t GetProgressUpdateDelayMS() override { return 0; }
+
+     // No progress bar is created; callers receive nullptr.
+     virtual ProgressBar* CreateProgressBar(std::string_view InSubTask) override
+     {
+         ZEN_UNUSED(InSubTask);
+         return nullptr;
+     }
+
+ private:
+     // Held as a raw pointer and never deleted by this class; may be null.
+     JobContext* m_OptionalContext;
+ };
+
+ std::unique_ptr<OperationLogOutput> LogOutput(std::make_unique<JobContextLogOutput>(OptionalContext));
+
size_t OpCount = 0;
CbObject OplogContainerObject;
@@ -1424,56 +1470,6 @@ BuildContainer(CidStore& ChunkStore,
return {};
}
- auto FindReuseBlocks = [](const std::vector<ThinChunkBlockDescription>& KnownBlocks,
- const std::unordered_set<IoHash, IoHash::Hasher>& Attachments,
- JobContext* OptionalContext) -> std::vector<size_t> {
- std::vector<size_t> ReuseBlockIndexes;
- if (!Attachments.empty() && !KnownBlocks.empty())
- {
- remotestore_impl::ReportMessage(
- OptionalContext,
- fmt::format("Checking {} Attachments against {} known blocks for reuse", Attachments.size(), KnownBlocks.size()));
- Stopwatch ReuseTimer;
-
- for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++)
- {
- const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size();
- if (BlockAttachmentCount == 0)
- {
- continue;
- }
- size_t FoundAttachmentCount = 0;
- for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
- {
- if (Attachments.contains(KnownHash))
- {
- FoundAttachmentCount++;
- }
- }
-
- size_t ReusePercent = (FoundAttachmentCount * 100) / BlockAttachmentCount;
- // TODO: Configure reuse-level
- if (ReusePercent > 80)
- {
- ZEN_DEBUG("Reusing block {}. {} attachments found, usage level: {}%",
- KnownBlock.BlockHash,
- FoundAttachmentCount,
- ReusePercent);
- ReuseBlockIndexes.push_back(KnownBlockIndex);
- }
- else if (FoundAttachmentCount > 0)
- {
- ZEN_DEBUG("Skipping block {}. {} attachments found, usage level: {}%",
- KnownBlock.BlockHash,
- FoundAttachmentCount,
- ReusePercent);
- }
- }
- }
- return ReuseBlockIndexes;
- };
-
std::unordered_set<IoHash, IoHash::Hasher> FoundHashes;
FoundHashes.reserve(UploadAttachments.size());
for (const auto& It : UploadAttachments)
@@ -1482,15 +1478,35 @@ BuildContainer(CidStore& ChunkStore,
}
size_t ReusedAttachmentCount = 0;
- std::vector<size_t> ReusedBlockIndexes = FindReuseBlocks(KnownBlocks, FoundHashes, OptionalContext);
- for (size_t KnownBlockIndex : ReusedBlockIndexes)
+ std::vector<size_t> ReusedBlockIndexes;
{
- const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
+ std::vector<IoHash> ChunkHashes;
+ ChunkHashes.reserve(FoundHashes.size());
+ ChunkHashes.insert(ChunkHashes.begin(), FoundHashes.begin(), FoundHashes.end());
+ std::vector<uint32_t> ChunkIndexes;
+ ChunkIndexes.resize(FoundHashes.size());
+ std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0);
+
+ std::vector<uint32_t> UnusedChunkIndexes;
+ ReuseBlocksStatistics ReuseBlocksStats;
+
+ ReusedBlockIndexes = FindReuseBlocks(*LogOutput,
+ /*BlockReuseMinPercentLimit*/ 80,
+ /*IsVerbose*/ false,
+ ReuseBlocksStats,
+ KnownBlocks,
+ ChunkHashes,
+ ChunkIndexes,
+ UnusedChunkIndexes);
+ for (size_t KnownBlockIndex : ReusedBlockIndexes)
{
- if (UploadAttachments.erase(KnownHash) == 1)
+ const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
+ for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
{
- ReusedAttachmentCount++;
+ if (UploadAttachments.erase(KnownHash) == 1)
+ {
+ ReusedAttachmentCount++;
+ }
}
}
}
@@ -1823,20 +1839,39 @@ BuildContainer(CidStore& ChunkStore,
UploadAttachments.erase(It);
}
- std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(KnownBlocks, ChunkedHashes, OptionalContext);
- for (size_t KnownBlockIndex : ReusedBlockIndexes)
{
- const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
- for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
+ std::vector<IoHash> ChunkHashes;
+ ChunkHashes.reserve(ChunkedHashes.size());
+ ChunkHashes.insert(ChunkHashes.begin(), ChunkedHashes.begin(), ChunkedHashes.end());
+ std::vector<uint32_t> ChunkIndexes;
+ ChunkIndexes.resize(ChunkedHashes.size());
+ std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0);
+
+ std::vector<uint32_t> UnusedChunkIndexes;
+ ReuseBlocksStatistics ReuseBlocksStats;
+
+ std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(*LogOutput,
+ /*BlockReuseMinPercentLimit*/ 80,
+ /*IsVerbose*/ false,
+ ReuseBlocksStats,
+ KnownBlocks,
+ ChunkHashes,
+ ChunkIndexes,
+ UnusedChunkIndexes);
+ // Merge the blocks selected for the chunked hashes BEFORE walking the reuse list. The loop
+ // below previously ran over ReusedBlockIndexes while it still held only the earlier
+ // selection, so hashes covered by ReusedBlockFromChunking were never erased from
+ // ChunkedHashes. Duplicate indexes are harmless here: a second erase of the same hash
+ // returns 0 and is not counted.
+ ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end());
+ for (size_t KnownBlockIndex : ReusedBlockIndexes)
{
- if (ChunkedHashes.erase(KnownHash) == 1)
+ const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex];
+ for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes)
{
- ReusedAttachmentCount++;
+ if (ChunkedHashes.erase(KnownHash) == 1)
+ {
+ ReusedAttachmentCount++;
+ }
}
}
}
- ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end());
std::sort(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end());
auto UniqueKnownBlocksEnd = std::unique(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end());
size_t ReuseBlockCount = std::distance(ReusedBlockIndexes.begin(), UniqueKnownBlocksEnd);
@@ -2448,7 +2483,7 @@ SaveOplog(CidStore& ChunkStore,
OnBlock = UploadBlock;
}
- std::vector<ThinChunkBlockDescription> KnownBlocks;
+ std::vector<ChunkBlockDescription> KnownBlocks;
uint64_t TransferWallTimeMS = 0;
@@ -2473,6 +2508,22 @@ SaveOplog(CidStore& ChunkStore,
RemoteProjectStore::GetKnownBlocksResult KnownBlocksResult = RemoteStore.GetKnownBlocks();
TransferWallTimeMS += GetKnownBlocksTimer.GetElapsedTimeMs();
+ // Block descriptions that carry no per-chunk lengths get placeholder lengths of 1 so the
+ // length arrays have one entry per chunk hash.
+ for (ChunkBlockDescription& BlockDescription : KnownBlocksResult.Blocks)
+ {
+ if (BlockDescription.ChunkRawLengths.empty())
+ {
+ ZEN_ASSERT(BlockDescription.ChunkCompressedLengths.empty());
+
+ // Take the count from the hash list: ChunkRawLengths is empty in this branch, so using
+ // its size() always yielded 0 and the placeholder lengths below were never written.
+ size_t ChunkCount = BlockDescription.ChunkRawHashes.size();
+ if (ChunkCount > 0)
+ {
+ // Fake sizes, will give usage number of number of chunks used rather than bytes used - better than nothing
+ BlockDescription.ChunkRawLengths.resize(ChunkCount, 1);
+ BlockDescription.ChunkCompressedLengths.resize(ChunkCount, 1);
+ }
+ }
+ }
+
if (KnownBlocksResult.ErrorCode == static_cast<int>(HttpResponseCode::NoContent))
{
remotestore_impl::ReportMessage(OptionalContext,