diff options
| author | Dan Engelbrecht <[email protected]> | 2025-12-15 13:20:21 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-12-15 13:20:21 +0100 |
| commit | a715d3ab7701e6257730a73c62567052d21c9771 (patch) | |
| tree | 1f6b1de9c7cf11ec1403187d77d74a3b1af52a39 /src/zenremotestore/projectstore/remoteprojectstore.cpp | |
| parent | show download source data (#689) (diff) | |
| download | zen-a715d3ab7701e6257730a73c62567052d21c9771.tar.xz zen-a715d3ab7701e6257730a73c62567052d21c9771.zip | |
oplog download size (#690)
- Bugfix: Upload of oplogs could reference multiple blocks for the same chunk, causing redundant block downloads
- Improvement: zen oplog-export now uses the improved block-reuse selection function from zen builds upload, reducing oplog download size
Diffstat (limited to 'src/zenremotestore/projectstore/remoteprojectstore.cpp')
| -rw-r--r-- | src/zenremotestore/projectstore/remoteprojectstore.cpp | 185 |
1 file changed, 118 insertions, 67 deletions
diff --git a/src/zenremotestore/projectstore/remoteprojectstore.cpp b/src/zenremotestore/projectstore/remoteprojectstore.cpp index 5652d5271..0e18cc6b0 100644 --- a/src/zenremotestore/projectstore/remoteprojectstore.cpp +++ b/src/zenremotestore/projectstore/remoteprojectstore.cpp @@ -14,8 +14,10 @@ #include <zencore/workthreadpool.h> #include <zenhttp/httpcommon.h> #include <zenremotestore/chunking/chunkedfile.h> +#include <zenremotestore/operationlogoutput.h> #include <zenstore/cidstore.h> +#include <numeric> #include <unordered_map> #if ZEN_WITH_TESTS @@ -534,11 +536,16 @@ namespace remotestore_impl { return; } + uint64_t PotentialSize = 0; + uint64_t UsedSize = 0; + uint64_t BlockSize = BlockPayload.GetSize(); + uint64_t BlockHeaderSize = 0; bool StoreChunksOK = IterateChunkBlock( BlockPayload, - [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info](CompressedBuffer&& Chunk, - const IoHash& AttachmentRawHash) { + [&WantedChunks, &WriteAttachmentBuffers, &WriteRawHashes, &Info, &PotentialSize]( + CompressedBuffer&& Chunk, + const IoHash& AttachmentRawHash) { if (WantedChunks.contains(AttachmentRawHash)) { WriteAttachmentBuffers.emplace_back(Chunk.GetCompressed().Flatten().AsIoBuffer()); @@ -552,6 +559,7 @@ namespace remotestore_impl { ZEN_ASSERT(RawHash == AttachmentRawHash); WriteRawHashes.emplace_back(AttachmentRawHash); WantedChunks.erase(AttachmentRawHash); + PotentialSize += WriteAttachmentBuffers.back().GetSize(); } }, BlockHeaderSize); @@ -581,8 +589,16 @@ namespace remotestore_impl { { Info.AttachmentBytesStored.fetch_add(WriteAttachmentBuffers[Index].GetSize()); Info.AttachmentsStored.fetch_add(1); + UsedSize += WriteAttachmentBuffers[Index].GetSize(); } } + ZEN_DEBUG("Used {} (matching {}) out of {} for block {} ({} %) (use of matching {}%)", + NiceBytes(UsedSize), + NiceBytes(PotentialSize), + NiceBytes(BlockSize), + BlockHash, + (100 * UsedSize) / BlockSize, + PotentialSize > 0 ? 
(UsedSize * 100) / PotentialSize : 0); } } catch (const std::exception& Ex) @@ -1182,7 +1198,7 @@ BuildContainer(CidStore& ChunkStore, bool BuildBlocks, bool IgnoreMissingAttachments, bool AllowChunking, - const std::vector<ThinChunkBlockDescription>& KnownBlocks, + const std::vector<ChunkBlockDescription>& KnownBlocks, WorkerThreadPool& WorkerPool, const std::function<void(CompressedBuffer&&, ChunkBlockDescription&&)>& AsyncOnBlock, const std::function<void(const IoHash&, TGetAttachmentBufferFunc&&)>& OnLargeAttachment, @@ -1193,6 +1209,36 @@ BuildContainer(CidStore& ChunkStore, { using namespace std::literals; + class JobContextLogOutput : public OperationLogOutput + { + public: + JobContextLogOutput(JobContext* OptionalContext) : m_OptionalContext(OptionalContext) {} + virtual void EmitLogMessage(int LogLevel, std::string_view Format, fmt::format_args Args) override + { + ZEN_UNUSED(LogLevel); + if (m_OptionalContext) + { + fmt::basic_memory_buffer<char, 250> MessageBuffer; + fmt::vformat_to(fmt::appender(MessageBuffer), Format, Args); + remotestore_impl::ReportMessage(m_OptionalContext, std::string_view(MessageBuffer.data(), MessageBuffer.size())); + } + } + + virtual void SetLogOperationName(std::string_view Name) override { ZEN_UNUSED(Name); } + virtual void SetLogOperationProgress(uint32_t StepIndex, uint32_t StepCount) override { ZEN_UNUSED(StepIndex, StepCount); } + virtual uint32_t GetProgressUpdateDelayMS() override { return 0; } + virtual ProgressBar* CreateProgressBar(std::string_view InSubTask) override + { + ZEN_UNUSED(InSubTask); + return nullptr; + } + + private: + JobContext* m_OptionalContext; + }; + + std::unique_ptr<OperationLogOutput> LogOutput(std::make_unique<JobContextLogOutput>(OptionalContext)); + size_t OpCount = 0; CbObject OplogContainerObject; @@ -1424,56 +1470,6 @@ BuildContainer(CidStore& ChunkStore, return {}; } - auto FindReuseBlocks = [](const std::vector<ThinChunkBlockDescription>& KnownBlocks, - const std::unordered_set<IoHash, 
IoHash::Hasher>& Attachments, - JobContext* OptionalContext) -> std::vector<size_t> { - std::vector<size_t> ReuseBlockIndexes; - if (!Attachments.empty() && !KnownBlocks.empty()) - { - remotestore_impl::ReportMessage( - OptionalContext, - fmt::format("Checking {} Attachments against {} known blocks for reuse", Attachments.size(), KnownBlocks.size())); - Stopwatch ReuseTimer; - - for (size_t KnownBlockIndex = 0; KnownBlockIndex < KnownBlocks.size(); KnownBlockIndex++) - { - const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; - size_t BlockAttachmentCount = KnownBlock.ChunkRawHashes.size(); - if (BlockAttachmentCount == 0) - { - continue; - } - size_t FoundAttachmentCount = 0; - for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) - { - if (Attachments.contains(KnownHash)) - { - FoundAttachmentCount++; - } - } - - size_t ReusePercent = (FoundAttachmentCount * 100) / BlockAttachmentCount; - // TODO: Configure reuse-level - if (ReusePercent > 80) - { - ZEN_DEBUG("Reusing block {}. {} attachments found, usage level: {}%", - KnownBlock.BlockHash, - FoundAttachmentCount, - ReusePercent); - ReuseBlockIndexes.push_back(KnownBlockIndex); - } - else if (FoundAttachmentCount > 0) - { - ZEN_DEBUG("Skipping block {}. 
{} attachments found, usage level: {}%", - KnownBlock.BlockHash, - FoundAttachmentCount, - ReusePercent); - } - } - } - return ReuseBlockIndexes; - }; - std::unordered_set<IoHash, IoHash::Hasher> FoundHashes; FoundHashes.reserve(UploadAttachments.size()); for (const auto& It : UploadAttachments) @@ -1482,15 +1478,35 @@ BuildContainer(CidStore& ChunkStore, } size_t ReusedAttachmentCount = 0; - std::vector<size_t> ReusedBlockIndexes = FindReuseBlocks(KnownBlocks, FoundHashes, OptionalContext); - for (size_t KnownBlockIndex : ReusedBlockIndexes) + std::vector<size_t> ReusedBlockIndexes; { - const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; - for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) + std::vector<IoHash> ChunkHashes; + ChunkHashes.reserve(FoundHashes.size()); + ChunkHashes.insert(ChunkHashes.begin(), FoundHashes.begin(), FoundHashes.end()); + std::vector<uint32_t> ChunkIndexes; + ChunkIndexes.resize(FoundHashes.size()); + std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0); + + std::vector<uint32_t> UnusedChunkIndexes; + ReuseBlocksStatistics ReuseBlocksStats; + + ReusedBlockIndexes = FindReuseBlocks(*LogOutput, + /*BlockReuseMinPercentLimit*/ 80, + /*IsVerbose*/ false, + ReuseBlocksStats, + KnownBlocks, + ChunkHashes, + ChunkIndexes, + UnusedChunkIndexes); + for (size_t KnownBlockIndex : ReusedBlockIndexes) { - if (UploadAttachments.erase(KnownHash) == 1) + const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; + for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) { - ReusedAttachmentCount++; + if (UploadAttachments.erase(KnownHash) == 1) + { + ReusedAttachmentCount++; + } } } } @@ -1823,20 +1839,39 @@ BuildContainer(CidStore& ChunkStore, UploadAttachments.erase(It); } - std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(KnownBlocks, ChunkedHashes, OptionalContext); - for (size_t KnownBlockIndex : ReusedBlockIndexes) { - const ThinChunkBlockDescription& KnownBlock = 
KnownBlocks[KnownBlockIndex]; - for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) + std::vector<IoHash> ChunkHashes; + ChunkHashes.reserve(ChunkedHashes.size()); + ChunkHashes.insert(ChunkHashes.begin(), ChunkedHashes.begin(), ChunkedHashes.end()); + std::vector<uint32_t> ChunkIndexes; + ChunkIndexes.resize(ChunkedHashes.size()); + std::iota(ChunkIndexes.begin(), ChunkIndexes.end(), 0); + + std::vector<uint32_t> UnusedChunkIndexes; + ReuseBlocksStatistics ReuseBlocksStats; + + std::vector<size_t> ReusedBlockFromChunking = FindReuseBlocks(*LogOutput, + /*BlockReuseMinPercentLimit*/ 80, + /*IsVerbose*/ false, + ReuseBlocksStats, + KnownBlocks, + ChunkHashes, + ChunkIndexes, + UnusedChunkIndexes); + for (size_t KnownBlockIndex : ReusedBlockIndexes) { - if (ChunkedHashes.erase(KnownHash) == 1) + const ThinChunkBlockDescription& KnownBlock = KnownBlocks[KnownBlockIndex]; + for (const IoHash& KnownHash : KnownBlock.ChunkRawHashes) { - ReusedAttachmentCount++; + if (ChunkedHashes.erase(KnownHash) == 1) + { + ReusedAttachmentCount++; + } } } + ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end()); } - ReusedBlockIndexes.insert(ReusedBlockIndexes.end(), ReusedBlockFromChunking.begin(), ReusedBlockFromChunking.end()); std::sort(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end()); auto UniqueKnownBlocksEnd = std::unique(ReusedBlockIndexes.begin(), ReusedBlockIndexes.end()); size_t ReuseBlockCount = std::distance(ReusedBlockIndexes.begin(), UniqueKnownBlocksEnd); @@ -2448,7 +2483,7 @@ SaveOplog(CidStore& ChunkStore, OnBlock = UploadBlock; } - std::vector<ThinChunkBlockDescription> KnownBlocks; + std::vector<ChunkBlockDescription> KnownBlocks; uint64_t TransferWallTimeMS = 0; @@ -2473,6 +2508,22 @@ SaveOplog(CidStore& ChunkStore, RemoteProjectStore::GetKnownBlocksResult KnownBlocksResult = RemoteStore.GetKnownBlocks(); TransferWallTimeMS += GetKnownBlocksTimer.GetElapsedTimeMs(); + for 
(ChunkBlockDescription& BlockDescription : KnownBlocksResult.Blocks) + { + if (BlockDescription.ChunkRawLengths.empty()) + { + ZEN_ASSERT(BlockDescription.ChunkCompressedLengths.empty()); + + size_t ChunkCount = BlockDescription.ChunkRawLengths.size(); + if (ChunkCount > 0) + { + // Fake sizes, will give usage number of number of chunks used rather than bytes used - better than nothing + BlockDescription.ChunkRawLengths.resize(ChunkCount, 1); + BlockDescription.ChunkCompressedLengths.resize(ChunkCount, 1); + } + } + } + if (KnownBlocksResult.ErrorCode == static_cast<int>(HttpResponseCode::NoContent)) { remotestore_impl::ReportMessage(OptionalContext, |