aboutsummaryrefslogtreecommitdiff
path: root/src/zenremotestore/chunking/chunkedcontent.cpp
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2025-12-19 16:30:03 +0100
committerGitHub Enterprise <[email protected]>2025-12-19 16:30:03 +0100
commit0bf7531d530f12e0fa2edab70b6bf4693fb041db (patch)
tree0f29a872019d5c5b6952ef5e8babde1b6c7cd555 /src/zenremotestore/chunking/chunkedcontent.cpp
parent5.7.15 (diff)
downloadzen-0bf7531d530f12e0fa2edab70b6bf4693fb041db.tar.xz
zen-0bf7531d530f12e0fa2edab70b6bf4693fb041db.zip
optimize scavenge (#697)
* optimize FindScavengeContent
* optimize GetValidFolderContent
Diffstat (limited to 'src/zenremotestore/chunking/chunkedcontent.cpp')
-rw-r--r--src/zenremotestore/chunking/chunkedcontent.cpp212
1 files changed, 169 insertions, 43 deletions
diff --git a/src/zenremotestore/chunking/chunkedcontent.cpp b/src/zenremotestore/chunking/chunkedcontent.cpp
index 5f1876908..e8187d348 100644
--- a/src/zenremotestore/chunking/chunkedcontent.cpp
+++ b/src/zenremotestore/chunking/chunkedcontent.cpp
@@ -177,31 +177,6 @@ namespace {
std::string PathCompareString(const std::filesystem::path& Path) { return ToLower(Path.generic_string()); }
- tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> BuildHashLookup(std::span<const IoHash> Hashes)
- {
- tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> Lookup;
- Lookup.reserve(Hashes.size());
- for (uint32_t Index = 0; Index < Hashes.size(); Index++)
- {
- bool IsNew = Lookup.insert_or_assign(Hashes[Index], Index).second;
- ZEN_ASSERT(IsNew);
- }
- return Lookup;
- }
-
- std::vector<uint32_t> BuildChunkOrderOffset(std::span<const uint32_t> ChunkCounts)
- {
- std::vector<uint32_t> ChunkOffsets;
- ChunkOffsets.reserve(ChunkCounts.size());
- uint32_t Offset = 0;
- for (uint32_t SequenceIndex = 0; SequenceIndex < ChunkCounts.size(); SequenceIndex++)
- {
- ChunkOffsets.push_back(Offset);
- Offset += ChunkCounts[SequenceIndex];
- }
- return ChunkOffsets;
- }
-
} // namespace
std::string_view FolderContentSourcePlatformNames[(size_t)SourcePlatform::_Count] = {"Windows"sv, "Linux"sv, "MacOS"sv};
@@ -551,6 +526,134 @@ GetFolderContent(GetFolderContentStatistics& Stats,
return OrderedContent;
}
+FolderContent  // Returns size/attribute/modification-tick entries for the PathsToCheck found on disk; paths not found are dropped, order is kept.
+GetValidFolderContent(WorkerThreadPool& WorkerPool,  // pool the per-directory scan jobs are scheduled on
+ GetFolderContentStatistics& FolderScanStats,  // receives elapsed wall time and found/accepted file counters
+ const std::filesystem::path& Path,  // root that every entry of PathsToCheck is appended to (Path / entry)
+ std::span<const std::filesystem::path> PathsToCheck,  // candidate paths; the matching ones are copied into Result.Paths as given
+ std::function<void(uint64_t PathCount, uint64_t CompletedPathCount)>&& ProgressCallback,  // optional; note the counts passed below are directories, not paths
+ uint32_t ProgressUpdateDelayMS,  // forwarded to Work.Wait
+ std::atomic<bool>& AbortFlag,  // forwarded to ParallelWork
+ std::atomic<bool>& PauseFlag)  // forwarded to ParallelWork
+{
+ ZEN_TRACE_CPU("GetValidFolderContent");
+
+ FolderContent Result;
+ const uint32_t PathCount = gsl::narrow<uint32_t>(PathsToCheck.size());
+
+ Result.Paths.resize(PathCount);  // pre-size every column so the scan jobs can write by index
+ Result.RawSizes.resize(PathCount);
+ Result.Attributes.resize(PathCount);
+ Result.ModificationTicks.resize(PathCount);
+
+ {
+ Stopwatch Timer;
+ auto _ = MakeGuard([&FolderScanStats, &Timer]() { FolderScanStats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); });  // presumably runs at scope exit to record elapsed time - confirm MakeGuard semantics
+
+ tsl::robin_map<std::string, uint32_t> PathToPathIndex;  // full path in generic_string form -> index into PathsToCheck
+ PathToPathIndex.reserve(PathsToCheck.size());
+ std::vector<std::filesystem::path> DirectoriesToScan;  // unique parent directories; one scan job is scheduled per entry
+ {
+ tsl::robin_set<std::string> DirectoriesFound;  // used only to de-duplicate parent directories
+ for (size_t PathIndex = 0; PathIndex < PathsToCheck.size(); PathIndex++)
+ {
+ const std::filesystem::path PathToCheck = (Path / PathsToCheck[PathIndex]);
+ const std::string LookupPath = PathToCheck.generic_string();
+ PathToPathIndex.insert_or_assign(LookupPath, PathIndex);  // a repeated path keeps the last index; the index fits uint32_t because the size was narrow-checked above
+ std::filesystem::path ParentDirectoryPath = PathToCheck.parent_path();
+ const std::string Directory = ParentDirectoryPath.generic_string();
+ if (DirectoriesFound.insert(Directory).second)  // true only the first time this directory is seen
+ {
+ DirectoriesToScan.push_back(ParentDirectoryPath.make_preferred());  // stored with the platform's preferred separators
+ }
+ }
+ }
+
+ ParallelWork Work(AbortFlag,
+ PauseFlag,
+ ProgressCallback ? WorkerThreadPool::EMode::EnableBacklog : WorkerThreadPool::EMode::DisableBacklog);  // mode depends on whether a progress callback was supplied
+
+ std::atomic<uint64_t> CompletedDirectoryCount = 0;
+ for (size_t DirectoryIndex = 0; DirectoryIndex < DirectoriesToScan.size(); DirectoryIndex++)
+ {
+ Work.ScheduleWork(
+ WorkerPool,
+ [&DirectoriesToScan, DirectoryIndex, &Result, &FolderScanStats, &PathsToCheck, &PathToPathIndex, &CompletedDirectoryCount](  // locals captured by reference; presumably Work.Wait below blocks until the jobs finish - confirm
+ std::atomic<bool>& AbortFlag) {
+ if (!AbortFlag)  // an aborted job skips the scan but is still counted as completed below
+ {
+ ZEN_TRACE_CPU("GetValidFolderContent_ScanDirectory");
+
+ const std::filesystem::path ParentDirectoryPath = DirectoriesToScan[DirectoryIndex];
+ try
+ {
+ if (IsDir(ParentDirectoryPath))  // only list the path when IsDir accepts it
+ {
+ DirectoryContent DirContent;
+ GetDirectoryContent(ParentDirectoryPath,
+ DirectoryContentFlags::IncludeFiles | DirectoryContentFlags::IncludeFileSizes |
+ DirectoryContentFlags::IncludeModificationTick |
+ DirectoryContentFlags::IncludeAttributes,
+ DirContent);
+ for (size_t FoundIndex = 0; FoundIndex < DirContent.Files.size(); FoundIndex++)
+ {
+ const std::filesystem::path& FoundPath = DirContent.Files[FoundIndex];
+ if (auto It = PathToPathIndex.find(FoundPath.generic_string()); It != PathToPathIndex.end())  // matched on generic_string, the same form used to build the map
+ {
+ const size_t PathIndex = It->second;
+
+ Result.Paths[PathIndex] = PathsToCheck[PathIndex];  // a non-empty Paths entry doubles as the "found" marker for the compaction pass
+ Result.RawSizes[PathIndex] = DirContent.FileSizes[FoundIndex];
+ Result.ModificationTicks[PathIndex] = DirContent.FileModificationTicks[FoundIndex];
+ Result.Attributes[PathIndex] = DirContent.FileAttributes[FoundIndex];
+
+ FolderScanStats.FoundFileCount++;  // every match is counted as both found and accepted
+ FolderScanStats.FoundFileByteCount += Result.RawSizes[PathIndex];
+ FolderScanStats.AcceptedFileCount++;
+ FolderScanStats.AcceptedFileByteCount += Result.RawSizes[PathIndex];
+ }
+ }
+ }
+ }
+ catch (const std::exception& Ex)  // a failing directory is logged and skipped; other directories are unaffected
+ {
+ ZEN_WARN("Failed checking content of folder '{}', reason: {}", ParentDirectoryPath, Ex.what());
+ }
+ }
+ CompletedDirectoryCount++;  // incremented whether or not the directory was actually scanned
+ });
+ }
+ Work.Wait(ProgressUpdateDelayMS, [&](bool, bool, ptrdiff_t) {  // the three callback arguments are ignored
+ if (ProgressCallback)
+ {
+ ProgressCallback(DirectoriesToScan.size(), CompletedDirectoryCount.load());  // reports directory totals/progress
+ }
+ });
+ }
+
+ uint32_t WritePathIndex = 0;  // compaction: next slot to fill; found entries keep their relative order
+ for (uint32_t ReadPathIndex = 0; ReadPathIndex < PathCount; ReadPathIndex++)
+ {
+ if (!Result.Paths[ReadPathIndex].empty())  // empty path means the entry was never matched on disk
+ {
+ if (WritePathIndex < ReadPathIndex)  // only move once a gap has opened up
+ {
+ Result.Paths[WritePathIndex] = std::move(Result.Paths[ReadPathIndex]);
+ Result.RawSizes[WritePathIndex] = Result.RawSizes[ReadPathIndex];
+ Result.Attributes[WritePathIndex] = Result.Attributes[ReadPathIndex];
+ Result.ModificationTicks[WritePathIndex] = Result.ModificationTicks[ReadPathIndex];
+ }
+ WritePathIndex++;
+ }
+ }
+
+ Result.Paths.resize(WritePathIndex);  // trim the unused tail left behind by the compaction
+ Result.RawSizes.resize(WritePathIndex);
+ Result.Attributes.resize(WritePathIndex);
+ Result.ModificationTicks.resize(WritePathIndex);
+ return Result;
+}
+
void
SaveChunkedFolderContentToCompactBinary(const ChunkedFolderContent& Content, CbWriter& Output)
{
@@ -726,9 +829,10 @@ MergeChunkedFolderContents(const ChunkedFolderContent& Base, std::span<const Chu
}
ChunkedFolderContent
-DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent,
- const ChunkedContentLookup& BaseContentLookup,
- std::span<const std::filesystem::path> DeletedPaths)
+DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent,
+ const tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& RawHashToSequenceIndex,
+ std::vector<uint32_t> SequenceIndexChunkOrderOffset,
+ std::span<const std::filesystem::path> DeletedPaths)
{
ZEN_TRACE_CPU("DeletePathsFromChunkedContent");
@@ -776,8 +880,8 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent,
{
RawHashToSequenceRawHashIndex.insert(
{RawHash, gsl::narrow<uint32_t>(Result.ChunkedContent.SequenceRawHashes.size())});
- const uint32_t SequenceRawHashIndex = BaseContentLookup.RawHashToSequenceIndex.at(RawHash);
- const uint32_t OrderIndexOffset = BaseContentLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex];
+ const uint32_t SequenceRawHashIndex = RawHashToSequenceIndex.at(RawHash);
+ const uint32_t OrderIndexOffset = SequenceIndexChunkOrderOffset[SequenceRawHashIndex];
const uint32_t ChunkCount = BaseContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex];
std::span<const uint32_t> OriginalChunkOrder =
@@ -820,8 +924,12 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span
{
return {};
}
- const ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent);
- return DeletePathsFromChunkedContent(BaseContent, BaseLookup, DeletedPaths);
+
+ tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> BaseSequenceHashToSequenceIndex =
+ BuildHashLookup(BaseContent.ChunkedContent.SequenceRawHashes);
+ std::vector<uint32_t> BaseSequenceChunkOrderOffset = BuildChunkOrderOffset(BaseContent.ChunkedContent.ChunkCounts);
+
+ return DeletePathsFromChunkedContent(BaseContent, BaseSequenceHashToSequenceIndex, BaseSequenceChunkOrderOffset, DeletedPaths);
}
bool
@@ -1082,6 +1190,33 @@ ChunkFolderContent(ChunkingStatistics& Stats,
return Result;
}
+tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>  // Builds a map from each hash to its position in Hashes.
+BuildHashLookup(std::span<const IoHash> Hashes)
+{
+ tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> Lookup;
+ Lookup.reserve(Hashes.size());  // one entry per input hash
+ for (uint32_t Index = 0; Index < Hashes.size(); Index++)
+ {
+ bool IsNew = Lookup.insert_or_assign(Hashes[Index], Index).second;
+ ZEN_ASSERT(IsNew);  // the input is expected to contain no duplicate hashes
+ }
+ return Lookup;
+}
+
+std::vector<uint32_t>  // Returns, per sequence, the sum of the chunk counts of all preceding sequences (first entry is 0).
+BuildChunkOrderOffset(std::span<const uint32_t> ChunkCounts)
+{
+ std::vector<uint32_t> ChunkOffsets;
+ ChunkOffsets.reserve(ChunkCounts.size());  // output has exactly one offset per input count
+ uint32_t Offset = 0;  // running total; as a uint32_t it wraps if the counts sum past 32 bits
+ for (uint32_t SequenceIndex = 0; SequenceIndex < ChunkCounts.size(); SequenceIndex++)
+ {
+ ChunkOffsets.push_back(Offset);  // store the offset before adding this sequence's own count
+ Offset += ChunkCounts[SequenceIndex];
+ }
+ return ChunkOffsets;
+}
+
ChunkedContentLookup
BuildChunkedContentLookup(const ChunkedFolderContent& Content)
{
@@ -1096,17 +1231,8 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content)
ChunkedContentLookup Result;
{
- const uint32_t SequenceRawHashesCount = gsl::narrow<uint32_t>(Content.ChunkedContent.SequenceRawHashes.size());
- Result.RawHashToSequenceIndex.reserve(SequenceRawHashesCount);
- Result.SequenceIndexChunkOrderOffset.reserve(SequenceRawHashesCount);
- uint32_t OrderOffset = 0;
- for (uint32_t SequenceRawHashIndex = 0; SequenceRawHashIndex < Content.ChunkedContent.SequenceRawHashes.size();
- SequenceRawHashIndex++)
- {
- Result.RawHashToSequenceIndex.insert({Content.ChunkedContent.SequenceRawHashes[SequenceRawHashIndex], SequenceRawHashIndex});
- Result.SequenceIndexChunkOrderOffset.push_back(OrderOffset);
- OrderOffset += Content.ChunkedContent.ChunkCounts[SequenceRawHashIndex];
- }
+ Result.SequenceIndexChunkOrderOffset = BuildChunkOrderOffset(Content.ChunkedContent.ChunkCounts);
+ Result.RawHashToSequenceIndex = BuildHashLookup(Content.ChunkedContent.SequenceRawHashes);
}
std::vector<ChunkLocationReference> Locations;