aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2026-01-08 14:41:08 +0100
committerGitHub Enterprise <[email protected]>2026-01-08 14:41:08 +0100
commit887bc638d2cdc529739f7fb884926024761c298e (patch)
treed7c47cde6bf2e12fe2fe12c85377898d7a3f5e35
parentadded early-out check in GcManager::ScrubStorage(ScrubContext& GcCtx) (#698) (diff)
downloadzen-887bc638d2cdc529739f7fb884926024761c298e.tar.xz
zen-887bc638d2cdc529739f7fb884926024761c298e.zip
optimize scavenge part 2 (#699)
* optimize scavenge discovery by iterate over chunks only instead of iterating through all chunks in the scavenged files * refactor scavenge lookup
-rw-r--r--src/zenremotestore/builds/buildstorageoperations.cpp246
-rw-r--r--src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h11
2 files changed, 109 insertions, 148 deletions
diff --git a/src/zenremotestore/builds/buildstorageoperations.cpp b/src/zenremotestore/builds/buildstorageoperations.cpp
index b9f5eb07a..3ca2f72c1 100644
--- a/src/zenremotestore/builds/buildstorageoperations.cpp
+++ b/src/zenremotestore/builds/buildstorageoperations.cpp
@@ -782,78 +782,17 @@ BuildsOperationUpdateFolder::Execute(FolderContent& OutLocalFolderState)
Stopwatch LocalTimer;
- for (uint32_t LocalSequenceIndex = 0;
- LocalSequenceIndex < m_LocalContent.ChunkedContent.SequenceRawHashes.size() && (RemainingChunkCount > 0);
- LocalSequenceIndex++)
- {
- const IoHash& LocalSequenceRawHash = m_LocalContent.ChunkedContent.SequenceRawHashes[LocalSequenceIndex];
- const uint32_t LocalOrderOffset = m_LocalLookup.SequenceIndexChunkOrderOffset[LocalSequenceIndex];
-
- {
- uint64_t SourceOffset = 0;
- const uint32_t LocalChunkCount = m_LocalContent.ChunkedContent.ChunkCounts[LocalSequenceIndex];
- for (uint32_t LocalOrderIndex = 0; LocalOrderIndex < LocalChunkCount; LocalOrderIndex++)
- {
- const uint32_t LocalChunkIndex = m_LocalContent.ChunkedContent.ChunkOrders[LocalOrderOffset + LocalOrderIndex];
- const IoHash& LocalChunkHash = m_LocalContent.ChunkedContent.ChunkHashes[LocalChunkIndex];
- const uint64_t LocalChunkRawSize = m_LocalContent.ChunkedContent.ChunkRawSizes[LocalChunkIndex];
-
- if (auto RemoteChunkIt = m_RemoteLookup.ChunkHashToChunkIndex.find(LocalChunkHash);
- RemoteChunkIt != m_RemoteLookup.ChunkHashToChunkIndex.end())
- {
- const uint32_t RemoteChunkIndex = RemoteChunkIt->second;
- if (!RemoteChunkIndexNeedsCopyFromLocalFileFlags[RemoteChunkIndex])
- {
- std::vector<const ChunkedContentLookup::ChunkSequenceLocation*> ChunkTargetPtrs =
- GetRemainingChunkTargets(SequenceIndexChunksLeftToWriteCounters, RemoteChunkIndex);
+ ScavengeSourceForChunks(RemainingChunkCount,
+ RemoteChunkIndexNeedsCopyFromLocalFileFlags,
+ RawHashToCopyChunkDataIndex,
+ SequenceIndexChunksLeftToWriteCounters,
+ m_LocalContent,
+ m_LocalLookup,
+ CopyChunkDatas,
+ uint32_t(-1),
+ m_CacheMappingStats.LocalChunkMatchingRemoteCount,
+ m_CacheMappingStats.LocalChunkMatchingRemoteByteCount);
- if (!ChunkTargetPtrs.empty())
- {
- CopyChunkData::ChunkTarget Target = {
- .TargetChunkLocationCount = gsl::narrow<uint32_t>(ChunkTargetPtrs.size()),
- .RemoteChunkIndex = RemoteChunkIndex,
- .CacheFileOffset = SourceOffset};
- if (auto CopySourceIt = RawHashToCopyChunkDataIndex.find(LocalSequenceRawHash);
- CopySourceIt != RawHashToCopyChunkDataIndex.end())
- {
- CopyChunkData& Data = CopyChunkDatas[CopySourceIt->second];
- if (Data.TargetChunkLocationPtrs.size() > 1024)
- {
- RawHashToCopyChunkDataIndex.insert_or_assign(LocalSequenceRawHash, CopyChunkDatas.size());
- CopyChunkDatas.push_back(
- CopyChunkData{.ScavengeSourceIndex = (uint32_t)-1,
- .SourceSequenceIndex = LocalSequenceIndex,
- .TargetChunkLocationPtrs = ChunkTargetPtrs,
- .ChunkTargets = std::vector<CopyChunkData::ChunkTarget>{Target}});
- }
- else
- {
- Data.TargetChunkLocationPtrs.insert(Data.TargetChunkLocationPtrs.end(),
- ChunkTargetPtrs.begin(),
- ChunkTargetPtrs.end());
- Data.ChunkTargets.push_back(Target);
- }
- }
- else
- {
- RawHashToCopyChunkDataIndex.insert_or_assign(LocalSequenceRawHash, CopyChunkDatas.size());
- CopyChunkDatas.push_back(
- CopyChunkData{.ScavengeSourceIndex = (uint32_t)-1,
- .SourceSequenceIndex = LocalSequenceIndex,
- .TargetChunkLocationPtrs = ChunkTargetPtrs,
- .ChunkTargets = std::vector<CopyChunkData::ChunkTarget>{Target}});
- }
- m_CacheMappingStats.LocalChunkMatchingRemoteCount++;
- m_CacheMappingStats.LocalChunkMatchingRemoteByteCount += LocalChunkRawSize;
- RemoteChunkIndexNeedsCopyFromLocalFileFlags[RemoteChunkIndex] = true;
- RemainingChunkCount--;
- }
- }
- }
- SourceOffset += LocalChunkRawSize;
- }
- }
- }
m_CacheMappingStats.LocalScanElapsedWallTimeUs += LocalTimer.GetElapsedTimeUs();
}
@@ -867,86 +806,22 @@ BuildsOperationUpdateFolder::Execute(FolderContent& OutLocalFolderState)
ScavengedContentIndex++)
{
const ChunkedFolderContent& ScavengedContent = ScavengedContents[ScavengedContentIndex];
- // const std::filesystem::path& ScavengedPath = ScavengedPaths[ScavengedContentIndex];
- const ChunkedContentLookup& ScavengedLookup = ScavengedLookups[ScavengedContentIndex];
-
- for (uint32_t ScavengedSequenceIndex = 0;
- ScavengedSequenceIndex < ScavengedContent.ChunkedContent.SequenceRawHashes.size() && (RemainingChunkCount > 0);
- ScavengedSequenceIndex++)
- {
- const IoHash& ScavengedSequenceRawHash = ScavengedContent.ChunkedContent.SequenceRawHashes[ScavengedSequenceIndex];
- const uint32_t ScavengedOrderOffset = ScavengedLookup.SequenceIndexChunkOrderOffset[ScavengedSequenceIndex];
-
- {
- uint64_t SourceOffset = 0;
- const uint32_t ScavengedChunkCount = ScavengedContent.ChunkedContent.ChunkCounts[ScavengedSequenceIndex];
- for (uint32_t ScavengedOrderIndex = 0; ScavengedOrderIndex < ScavengedChunkCount; ScavengedOrderIndex++)
- {
- const uint32_t ScavengedChunkIndex =
- ScavengedContent.ChunkedContent.ChunkOrders[ScavengedOrderOffset + ScavengedOrderIndex];
- const IoHash& ScavengedChunkHash = ScavengedContent.ChunkedContent.ChunkHashes[ScavengedChunkIndex];
- const uint64_t ScavengedChunkRawSize = ScavengedContent.ChunkedContent.ChunkRawSizes[ScavengedChunkIndex];
-
- if (auto RemoteChunkIt = m_RemoteLookup.ChunkHashToChunkIndex.find(ScavengedChunkHash);
- RemoteChunkIt != m_RemoteLookup.ChunkHashToChunkIndex.end())
- {
- const uint32_t RemoteChunkIndex = RemoteChunkIt->second;
- if (!RemoteChunkIndexNeedsCopyFromLocalFileFlags[RemoteChunkIndex])
- {
- std::vector<const ChunkedContentLookup::ChunkSequenceLocation*> ChunkTargetPtrs =
- GetRemainingChunkTargets(SequenceIndexChunksLeftToWriteCounters, RemoteChunkIndex);
-
- if (!ChunkTargetPtrs.empty())
- {
- CopyChunkData::ChunkTarget Target = {
- .TargetChunkLocationCount = gsl::narrow<uint32_t>(ChunkTargetPtrs.size()),
- .RemoteChunkIndex = RemoteChunkIndex,
- .CacheFileOffset = SourceOffset};
- if (auto CopySourceIt = RawHashToCopyChunkDataIndex.find(ScavengedSequenceRawHash);
- CopySourceIt != RawHashToCopyChunkDataIndex.end())
- {
- CopyChunkData& Data = CopyChunkDatas[CopySourceIt->second];
- if (Data.TargetChunkLocationPtrs.size() > 1024)
- {
- RawHashToCopyChunkDataIndex.insert_or_assign(ScavengedSequenceRawHash,
- CopyChunkDatas.size());
- CopyChunkDatas.push_back(
- CopyChunkData{.ScavengeSourceIndex = ScavengedContentIndex,
- .SourceSequenceIndex = ScavengedSequenceIndex,
- .TargetChunkLocationPtrs = ChunkTargetPtrs,
- .ChunkTargets = std::vector<CopyChunkData::ChunkTarget>{Target}});
- }
- else
- {
- Data.TargetChunkLocationPtrs.insert(Data.TargetChunkLocationPtrs.end(),
- ChunkTargetPtrs.begin(),
- ChunkTargetPtrs.end());
- Data.ChunkTargets.push_back(Target);
- }
- }
- else
- {
- RawHashToCopyChunkDataIndex.insert_or_assign(ScavengedSequenceRawHash, CopyChunkDatas.size());
- CopyChunkDatas.push_back(
- CopyChunkData{.ScavengeSourceIndex = ScavengedContentIndex,
- .SourceSequenceIndex = ScavengedSequenceIndex,
- .TargetChunkLocationPtrs = ChunkTargetPtrs,
- .ChunkTargets = std::vector<CopyChunkData::ChunkTarget>{Target}});
- }
- m_CacheMappingStats.ScavengedChunkMatchingRemoteCount++;
- m_CacheMappingStats.ScavengedChunkMatchingRemoteByteCount += ScavengedChunkRawSize;
- RemoteChunkIndexNeedsCopyFromLocalFileFlags[RemoteChunkIndex] = true;
- RemainingChunkCount--;
- }
- }
- }
- SourceOffset += ScavengedChunkRawSize;
- }
- }
- }
+ const ChunkedContentLookup& ScavengedLookup = ScavengedLookups[ScavengedContentIndex];
+
+ ScavengeSourceForChunks(RemainingChunkCount,
+ RemoteChunkIndexNeedsCopyFromLocalFileFlags,
+ RawHashToCopyChunkDataIndex,
+ SequenceIndexChunksLeftToWriteCounters,
+ ScavengedContent,
+ ScavengedLookup,
+ CopyChunkDatas,
+ ScavengedContentIndex,
+ m_CacheMappingStats.ScavengedChunkMatchingRemoteCount,
+ m_CacheMappingStats.ScavengedChunkMatchingRemoteByteCount);
}
m_CacheMappingStats.ScavengeElapsedWallTimeUs += ScavengeTimer.GetElapsedTimeUs();
}
+
if (!m_Options.IsQuiet)
{
if (m_CacheMappingStats.CacheSequenceHashesCount > 0 || m_CacheMappingStats.CacheChunkCount > 0 ||
@@ -2941,6 +2816,81 @@ BuildsOperationUpdateFolder::FindScavengeContent(const ScavengeSource& Source,
return true;
}
+void
+BuildsOperationUpdateFolder::ScavengeSourceForChunks(uint32_t& InOutRemainingChunkCount,
+ std::vector<bool>& InOutRemoteChunkIndexNeedsCopyFromLocalFileFlags,
+ tsl::robin_map<IoHash, size_t, IoHash::Hasher>& InOutRawHashToCopyChunkDataIndex,
+ const std::vector<std::atomic<uint32_t>>& SequenceIndexChunksLeftToWriteCounters,
+ const ChunkedFolderContent& ScavengedContent,
+ const ChunkedContentLookup& ScavengedLookup,
+ std::vector<CopyChunkData>& InOutCopyChunkDatas,
+ uint32_t ScavengedContentIndex,
+ uint64_t& InOutChunkMatchingRemoteCount,
+ uint64_t& InOutChunkMatchingRemoteByteCount)
+{
+ for (uint32_t RemoteChunkIndex = 0;
+ RemoteChunkIndex < m_RemoteContent.ChunkedContent.ChunkHashes.size() && (InOutRemainingChunkCount > 0);
+ RemoteChunkIndex++)
+ {
+ if (!InOutRemoteChunkIndexNeedsCopyFromLocalFileFlags[RemoteChunkIndex])
+ {
+ const IoHash& RemoteChunkHash = m_RemoteContent.ChunkedContent.ChunkHashes[RemoteChunkIndex];
+ if (auto It = ScavengedLookup.ChunkHashToChunkIndex.find(RemoteChunkHash); It != ScavengedLookup.ChunkHashToChunkIndex.end())
+ {
+ std::vector<const ChunkedContentLookup::ChunkSequenceLocation*> ChunkTargetPtrs =
+ GetRemainingChunkTargets(SequenceIndexChunksLeftToWriteCounters, RemoteChunkIndex);
+
+ if (!ChunkTargetPtrs.empty())
+ {
+ const uint32_t ScavengedChunkIndex = It->second;
+ const uint64_t ScavengedChunkRawSize = ScavengedContent.ChunkedContent.ChunkRawSizes[ScavengedChunkIndex];
+ const size_t ChunkSequenceLocationOffset = ScavengedLookup.ChunkSequenceLocationOffset[ScavengedChunkIndex];
+ const ChunkedContentLookup::ChunkSequenceLocation& ScavengeLocation =
+ ScavengedLookup.ChunkSequenceLocations[ChunkSequenceLocationOffset];
+ const IoHash& ScavengedSequenceRawHash =
+ ScavengedContent.ChunkedContent.SequenceRawHashes[ScavengeLocation.SequenceIndex];
+
+ CopyChunkData::ChunkTarget Target = {.TargetChunkLocationCount = gsl::narrow<uint32_t>(ChunkTargetPtrs.size()),
+ .RemoteChunkIndex = RemoteChunkIndex,
+ .CacheFileOffset = ScavengeLocation.Offset};
+ if (auto CopySourceIt = InOutRawHashToCopyChunkDataIndex.find(ScavengedSequenceRawHash);
+ CopySourceIt != InOutRawHashToCopyChunkDataIndex.end())
+ {
+ CopyChunkData& Data = InOutCopyChunkDatas[CopySourceIt->second];
+ if (Data.TargetChunkLocationPtrs.size() > 1024)
+ {
+ InOutRawHashToCopyChunkDataIndex.insert_or_assign(ScavengedSequenceRawHash, InOutCopyChunkDatas.size());
+ InOutCopyChunkDatas.push_back(CopyChunkData{.ScavengeSourceIndex = ScavengedContentIndex,
+ .SourceSequenceIndex = ScavengeLocation.SequenceIndex,
+ .TargetChunkLocationPtrs = ChunkTargetPtrs,
+ .ChunkTargets = std::vector<CopyChunkData::ChunkTarget>{Target}});
+ }
+ else
+ {
+ Data.TargetChunkLocationPtrs.insert(Data.TargetChunkLocationPtrs.end(),
+ ChunkTargetPtrs.begin(),
+ ChunkTargetPtrs.end());
+ Data.ChunkTargets.push_back(Target);
+ }
+ }
+ else
+ {
+ InOutRawHashToCopyChunkDataIndex.insert_or_assign(ScavengedSequenceRawHash, InOutCopyChunkDatas.size());
+ InOutCopyChunkDatas.push_back(CopyChunkData{.ScavengeSourceIndex = ScavengedContentIndex,
+ .SourceSequenceIndex = ScavengeLocation.SequenceIndex,
+ .TargetChunkLocationPtrs = ChunkTargetPtrs,
+ .ChunkTargets = std::vector<CopyChunkData::ChunkTarget>{Target}});
+ }
+ InOutChunkMatchingRemoteCount++;
+ InOutChunkMatchingRemoteByteCount += ScavengedChunkRawSize;
+ InOutRemoteChunkIndexNeedsCopyFromLocalFileFlags[RemoteChunkIndex] = true;
+ InOutRemainingChunkCount--;
+ }
+ }
+ }
+ }
+}
+
std::filesystem::path
BuildsOperationUpdateFolder::FindDownloadedChunk(const IoHash& ChunkHash)
{
diff --git a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h
index d78ee29c1..51cd7f841 100644
--- a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h
+++ b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h
@@ -257,6 +257,17 @@ private:
ChunkedFolderContent& OutScavengedLocalContent,
ChunkedContentLookup& OutScavengedLookup);
+ void ScavengeSourceForChunks(uint32_t& InOutRemainingChunkCount,
+ std::vector<bool>& InOutRemoteChunkIndexNeedsCopyFromLocalFileFlags,
+ tsl::robin_map<IoHash, size_t, IoHash::Hasher>& InOutRawHashToCopyChunkDataIndex,
+ const std::vector<std::atomic<uint32_t>>& SequenceIndexChunksLeftToWriteCounters,
+ const ChunkedFolderContent& ScavengedContent,
+ const ChunkedContentLookup& ScavengedLookup,
+ std::vector<CopyChunkData>& InOutCopyChunkDatas,
+ uint32_t ScavengedContentIndex,
+ uint64_t& InOutChunkMatchingRemoteCount,
+ uint64_t& InOutChunkMatchingRemoteByteCount);
+
std::filesystem::path FindDownloadedChunk(const IoHash& ChunkHash);
std::vector<const ChunkedContentLookup::ChunkSequenceLocation*> GetRemainingChunkTargets(