diff options
| author | Dan Engelbrecht <[email protected]> | 2025-12-19 16:30:03 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-12-19 16:30:03 +0100 |
| commit | 0bf7531d530f12e0fa2edab70b6bf4693fb041db (patch) | |
| tree | 0f29a872019d5c5b6952ef5e8babde1b6c7cd555 /src/zenremotestore/chunking/chunkedcontent.cpp | |
| parent | 5.7.15 (diff) | |
| download | zen-0bf7531d530f12e0fa2edab70b6bf4693fb041db.tar.xz zen-0bf7531d530f12e0fa2edab70b6bf4693fb041db.zip | |
optimize scavenge (#697)
* optimize FindScavengeContent
* optimize GetValidFolderContent
Diffstat (limited to 'src/zenremotestore/chunking/chunkedcontent.cpp')
| -rw-r--r-- | src/zenremotestore/chunking/chunkedcontent.cpp | 212 |
1 files changed, 169 insertions, 43 deletions
diff --git a/src/zenremotestore/chunking/chunkedcontent.cpp b/src/zenremotestore/chunking/chunkedcontent.cpp index 5f1876908..e8187d348 100644 --- a/src/zenremotestore/chunking/chunkedcontent.cpp +++ b/src/zenremotestore/chunking/chunkedcontent.cpp @@ -177,31 +177,6 @@ namespace { std::string PathCompareString(const std::filesystem::path& Path) { return ToLower(Path.generic_string()); } - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> BuildHashLookup(std::span<const IoHash> Hashes) - { - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> Lookup; - Lookup.reserve(Hashes.size()); - for (uint32_t Index = 0; Index < Hashes.size(); Index++) - { - bool IsNew = Lookup.insert_or_assign(Hashes[Index], Index).second; - ZEN_ASSERT(IsNew); - } - return Lookup; - } - - std::vector<uint32_t> BuildChunkOrderOffset(std::span<const uint32_t> ChunkCounts) - { - std::vector<uint32_t> ChunkOffsets; - ChunkOffsets.reserve(ChunkCounts.size()); - uint32_t Offset = 0; - for (uint32_t SequenceIndex = 0; SequenceIndex < ChunkCounts.size(); SequenceIndex++) - { - ChunkOffsets.push_back(Offset); - Offset += ChunkCounts[SequenceIndex]; - } - return ChunkOffsets; - } - } // namespace std::string_view FolderContentSourcePlatformNames[(size_t)SourcePlatform::_Count] = {"Windows"sv, "Linux"sv, "MacOS"sv}; @@ -551,6 +526,134 @@ GetFolderContent(GetFolderContentStatistics& Stats, return OrderedContent; } +FolderContent +GetValidFolderContent(WorkerThreadPool& WorkerPool, + GetFolderContentStatistics& FolderScanStats, + const std::filesystem::path& Path, + std::span<const std::filesystem::path> PathsToCheck, + std::function<void(uint64_t PathCount, uint64_t CompletedPathCount)>&& ProgressCallback, + uint32_t ProgressUpdateDelayMS, + std::atomic<bool>& AbortFlag, + std::atomic<bool>& PauseFlag) +{ + ZEN_TRACE_CPU("GetValidFolderContent"); + + FolderContent Result; + const uint32_t PathCount = gsl::narrow<uint32_t>(PathsToCheck.size()); + + Result.Paths.resize(PathCount); + Result.RawSizes.resize(PathCount); + Result.Attributes.resize(PathCount); + Result.ModificationTicks.resize(PathCount); + + { + Stopwatch Timer; + auto _ = MakeGuard([&FolderScanStats, &Timer]() { FolderScanStats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); }); + + tsl::robin_map<std::string, uint32_t> PathToPathIndex; + PathToPathIndex.reserve(PathsToCheck.size()); + std::vector<std::filesystem::path> DirectoriesToScan; + { + tsl::robin_set<std::string> DirectoriesFound; + for (size_t PathIndex = 0; PathIndex < PathsToCheck.size(); PathIndex++) + { + const std::filesystem::path PathToCheck = (Path / PathsToCheck[PathIndex]); + const std::string LookupPath = PathToCheck.generic_string(); + PathToPathIndex.insert_or_assign(LookupPath, PathIndex); + std::filesystem::path ParentDirectoryPath = PathToCheck.parent_path(); + const std::string Directory = ParentDirectoryPath.generic_string(); + if (DirectoriesFound.insert(Directory).second) + { + DirectoriesToScan.push_back(ParentDirectoryPath.make_preferred()); + } + } + } + + ParallelWork Work(AbortFlag, + PauseFlag, + ProgressCallback ? WorkerThreadPool::EMode::EnableBacklog : WorkerThreadPool::EMode::DisableBacklog); + + std::atomic<uint64_t> CompletedDirectoryCount = 0; + for (size_t DirectoryIndex = 0; DirectoryIndex < DirectoriesToScan.size(); DirectoryIndex++) + { + Work.ScheduleWork( + WorkerPool, + [&DirectoriesToScan, DirectoryIndex, &Result, &FolderScanStats, &PathsToCheck, &PathToPathIndex, &CompletedDirectoryCount]( + std::atomic<bool>& AbortFlag) { + if (!AbortFlag) + { + ZEN_TRACE_CPU("GetValidFolderContent_ScanDirectory"); + + const std::filesystem::path ParentDirectoryPath = DirectoriesToScan[DirectoryIndex]; + try + { + if (IsDir(ParentDirectoryPath)) + { + DirectoryContent DirContent; + GetDirectoryContent(ParentDirectoryPath, + DirectoryContentFlags::IncludeFiles | DirectoryContentFlags::IncludeFileSizes | + DirectoryContentFlags::IncludeModificationTick | + DirectoryContentFlags::IncludeAttributes, + DirContent); + for (size_t FoundIndex = 0; FoundIndex < DirContent.Files.size(); FoundIndex++) + { + const std::filesystem::path& FoundPath = DirContent.Files[FoundIndex]; + if (auto It = PathToPathIndex.find(FoundPath.generic_string()); It != PathToPathIndex.end()) + { + const size_t PathIndex = It->second; + + Result.Paths[PathIndex] = PathsToCheck[PathIndex]; + Result.RawSizes[PathIndex] = DirContent.FileSizes[FoundIndex]; + Result.ModificationTicks[PathIndex] = DirContent.FileModificationTicks[FoundIndex]; + Result.Attributes[PathIndex] = DirContent.FileAttributes[FoundIndex]; + + FolderScanStats.FoundFileCount++; + FolderScanStats.FoundFileByteCount += Result.RawSizes[PathIndex]; + FolderScanStats.AcceptedFileCount++; + FolderScanStats.AcceptedFileByteCount += Result.RawSizes[PathIndex]; + } + } + } + } + catch (const std::exception& Ex) + { + ZEN_WARN("Failed checking content of folder '{}', reason: {}", ParentDirectoryPath, Ex.what()); + } + } + CompletedDirectoryCount++; + }); + } + Work.Wait(ProgressUpdateDelayMS, [&](bool, bool, ptrdiff_t) { + if (ProgressCallback) + { + ProgressCallback(DirectoriesToScan.size(), CompletedDirectoryCount.load()); + } + }); + } + + uint32_t WritePathIndex = 0; + for (uint32_t ReadPathIndex = 0; ReadPathIndex < PathCount; ReadPathIndex++) + { + if (!Result.Paths[ReadPathIndex].empty()) + { + if (WritePathIndex < ReadPathIndex) + { + Result.Paths[WritePathIndex] = std::move(Result.Paths[ReadPathIndex]); + Result.RawSizes[WritePathIndex] = Result.RawSizes[ReadPathIndex]; + Result.Attributes[WritePathIndex] = Result.Attributes[ReadPathIndex]; + Result.ModificationTicks[WritePathIndex] = Result.ModificationTicks[ReadPathIndex]; + } + WritePathIndex++; + } + } + + Result.Paths.resize(WritePathIndex); + Result.RawSizes.resize(WritePathIndex); + Result.Attributes.resize(WritePathIndex); + Result.ModificationTicks.resize(WritePathIndex); + return Result; +} + void SaveChunkedFolderContentToCompactBinary(const ChunkedFolderContent& Content, CbWriter& Output) { @@ -726,9 +829,10 @@ MergeChunkedFolderContents(const ChunkedFolderContent& Base, std::span<const Chu } ChunkedFolderContent -DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, - const ChunkedContentLookup& BaseContentLookup, - std::span<const std::filesystem::path> DeletedPaths) +DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, + const tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& RawHashToSequenceIndex, + std::vector<uint32_t> SequenceIndexChunkOrderOffset, + std::span<const std::filesystem::path> DeletedPaths) { ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); @@ -776,8 +880,8 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, { RawHashToSequenceRawHashIndex.insert( {RawHash, gsl::narrow<uint32_t>(Result.ChunkedContent.SequenceRawHashes.size())}); - const uint32_t SequenceRawHashIndex = BaseContentLookup.RawHashToSequenceIndex.at(RawHash); - const uint32_t OrderIndexOffset = BaseContentLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; + const uint32_t SequenceRawHashIndex = RawHashToSequenceIndex.at(RawHash); + const uint32_t OrderIndexOffset = SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; const uint32_t ChunkCount = BaseContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; std::span<const uint32_t> OriginalChunkOrder = @@ -820,8 +924,12 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span { return {}; } - const ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent); - return DeletePathsFromChunkedContent(BaseContent, BaseLookup, DeletedPaths); + + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> BaseSequenceHashToSequenceIndex = + BuildHashLookup(BaseContent.ChunkedContent.SequenceRawHashes); + std::vector<uint32_t> BaseSequenceChunkOrderOffset = BuildChunkOrderOffset(BaseContent.ChunkedContent.ChunkCounts); + + return DeletePathsFromChunkedContent(BaseContent, BaseSequenceHashToSequenceIndex, BaseSequenceChunkOrderOffset, DeletedPaths); } bool @@ -1082,6 +1190,33 @@ ChunkFolderContent(ChunkingStatistics& Stats, return Result; } +tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> +BuildHashLookup(std::span<const IoHash> Hashes) +{ + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> Lookup; + Lookup.reserve(Hashes.size()); + for (uint32_t Index = 0; Index < Hashes.size(); Index++) + { + bool IsNew = Lookup.insert_or_assign(Hashes[Index], Index).second; + ZEN_ASSERT(IsNew); + } + return Lookup; +} + +std::vector<uint32_t> +BuildChunkOrderOffset(std::span<const uint32_t> ChunkCounts) +{ + std::vector<uint32_t> ChunkOffsets; + ChunkOffsets.reserve(ChunkCounts.size()); + uint32_t Offset = 0; + for (uint32_t SequenceIndex = 0; SequenceIndex < ChunkCounts.size(); SequenceIndex++) + { + ChunkOffsets.push_back(Offset); + Offset += ChunkCounts[SequenceIndex]; + } + return ChunkOffsets; +} + ChunkedContentLookup BuildChunkedContentLookup(const ChunkedFolderContent& Content) { @@ -1096,17 +1231,8 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) ChunkedContentLookup Result; { - const uint32_t SequenceRawHashesCount = gsl::narrow<uint32_t>(Content.ChunkedContent.SequenceRawHashes.size()); - Result.RawHashToSequenceIndex.reserve(SequenceRawHashesCount); - Result.SequenceIndexChunkOrderOffset.reserve(SequenceRawHashesCount); - uint32_t OrderOffset = 0; - for (uint32_t SequenceRawHashIndex = 0; SequenceRawHashIndex < Content.ChunkedContent.SequenceRawHashes.size(); - SequenceRawHashIndex++) - { - Result.RawHashToSequenceIndex.insert({Content.ChunkedContent.SequenceRawHashes[SequenceRawHashIndex], SequenceRawHashIndex}); - Result.SequenceIndexChunkOrderOffset.push_back(OrderOffset); - OrderOffset += Content.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; - } + Result.SequenceIndexChunkOrderOffset = BuildChunkOrderOffset(Content.ChunkedContent.ChunkCounts); + Result.RawHashToSequenceIndex = BuildHashLookup(Content.ChunkedContent.SequenceRawHashes); } std::vector<ChunkLocationReference> Locations; |