diff options
| author | Dan Engelbrecht <[email protected]> | 2025-10-03 11:49:14 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-10-03 11:49:14 +0200 |
| commit | faf0b7c9b6a08b095f8dc895904f4f7d3f30dcde (patch) | |
| tree | 2bcd09fe17af6f25108fd05578e7eda6a827d8ec /src/zenutil/chunkedcontent.cpp | |
| parent | cache RPC replay fixes (minor) (#544) (diff) | |
| download | zen-faf0b7c9b6a08b095f8dc895904f4f7d3f30dcde.tar.xz zen-faf0b7c9b6a08b095f8dc895904f4f7d3f30dcde.zip | |
move chunking code to zenremotestore lib (#545)
Diffstat (limited to 'src/zenutil/chunkedcontent.cpp')
| -rw-r--r-- | src/zenutil/chunkedcontent.cpp | 953 |
1 files changed, 0 insertions, 953 deletions
diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp deleted file mode 100644 index 757bcfae5..000000000 --- a/src/zenutil/chunkedcontent.cpp +++ /dev/null @@ -1,953 +0,0 @@ -// Copyright Epic Games, Inc. All Rights Reserved. - -#include <zenutil/chunkedcontent.h> - -#include <zencore/filesystem.h> -#include <zencore/fmtutils.h> -#include <zencore/logging.h> -#include <zencore/scopeguard.h> -#include <zencore/timer.h> -#include <zencore/trace.h> - -#include <zenutil/chunkedfile.h> -#include <zenutil/chunkingcontroller.h> -#include <zenutil/parallelwork.h> -#include <zenutil/workerpools.h> - -ZEN_THIRD_PARTY_INCLUDES_START -#include <tsl/robin_set.h> -#include <gsl/gsl-lite.hpp> -ZEN_THIRD_PARTY_INCLUDES_END - -namespace zen { - -using namespace std::literals; - -namespace { - void AddChunkSequence(ChunkingStatistics& Stats, - ChunkedContentData& InOutChunkedContent, - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, - const IoHash& RawHash, - std::span<const uint32_t> ChunkSequence, - std::span<const IoHash> ChunkHashes, - std::span<const uint64_t> ChunkRawSizes) - { - ZEN_ASSERT(ChunkHashes.size() == ChunkRawSizes.size()); - InOutChunkedContent.ChunkCounts.push_back(gsl::narrow<uint32_t>(ChunkSequence.size())); - InOutChunkedContent.ChunkOrders.reserve(InOutChunkedContent.ChunkOrders.size() + ChunkSequence.size()); - - for (uint32_t ChunkedSequenceIndex : ChunkSequence) - { - const IoHash& ChunkHash = ChunkHashes[ChunkedSequenceIndex]; - if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) - { - uint32_t ChunkIndex = gsl::narrow<uint32_t>(It->second); - InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); - } - else - { - uint32_t ChunkIndex = gsl::narrow<uint32_t>(InOutChunkedContent.ChunkHashes.size()); - ChunkHashToChunkIndex.insert_or_assign(ChunkHash, ChunkIndex); - InOutChunkedContent.ChunkHashes.push_back(ChunkHash); - InOutChunkedContent.ChunkRawSizes.push_back(ChunkRawSizes[ChunkedSequenceIndex]); - InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); - Stats.UniqueChunksFound++; - Stats.UniqueBytesFound += ChunkRawSizes[ChunkedSequenceIndex]; - } - } - InOutChunkedContent.SequenceRawHashes.push_back(RawHash); - Stats.UniqueSequencesFound++; - } - - void AddChunkSequence(ChunkingStatistics& Stats, - ChunkedContentData& InOutChunkedContent, - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, - const IoHash& RawHash, - const uint64_t RawSize) - { - InOutChunkedContent.ChunkCounts.push_back(1); - - if (auto It = ChunkHashToChunkIndex.find(RawHash); It != ChunkHashToChunkIndex.end()) - { - uint32_t ChunkIndex = gsl::narrow<uint32_t>(It->second); - InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); - } - else - { - uint32_t ChunkIndex = gsl::narrow<uint32_t>(InOutChunkedContent.ChunkHashes.size()); - ChunkHashToChunkIndex.insert_or_assign(RawHash, ChunkIndex); - InOutChunkedContent.ChunkHashes.push_back(RawHash); - InOutChunkedContent.ChunkRawSizes.push_back(RawSize); - InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); - Stats.UniqueChunksFound++; - Stats.UniqueBytesFound += RawSize; - } - InOutChunkedContent.SequenceRawHashes.push_back(RawHash); - Stats.UniqueSequencesFound++; - } - - IoHash HashOneFile(ChunkingStatistics& Stats, - const ChunkingController& InChunkingController, - ChunkedFolderContent& OutChunkedContent, - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& RawHashToSequenceRawHashIndex, - RwLock& Lock, - const std::filesystem::path& FolderPath, - uint32_t PathIndex, - std::atomic<bool>& AbortFlag) - { - ZEN_TRACE_CPU("ChunkFolderContent"); - - const uint64_t RawSize = OutChunkedContent.RawSizes[PathIndex]; - const std::filesystem::path& Path = OutChunkedContent.Paths[PathIndex]; - - if (RawSize == 0) - { - return IoHash::Zero; - } - else - { - ChunkedInfoWithSource Chunked; - const bool DidChunking = - InChunkingController.ProcessFile((FolderPath / Path).make_preferred(), RawSize, Chunked, Stats.BytesHashed, AbortFlag); - if (DidChunking) - { - Lock.WithExclusiveLock([&]() { - if (!RawHashToSequenceRawHashIndex.contains(Chunked.Info.RawHash)) - { - RawHashToSequenceRawHashIndex.insert( - {Chunked.Info.RawHash, gsl::narrow<uint32_t>(OutChunkedContent.ChunkedContent.SequenceRawHashes.size())}); - std::vector<uint64_t> ChunkSizes; - ChunkSizes.reserve(Chunked.ChunkSources.size()); - for (const ChunkSource& Source : Chunked.ChunkSources) - { - ChunkSizes.push_back(Source.Size); - } - AddChunkSequence(Stats, - OutChunkedContent.ChunkedContent, - ChunkHashToChunkIndex, - Chunked.Info.RawHash, - Chunked.Info.ChunkSequence, - Chunked.Info.ChunkHashes, - ChunkSizes); - Stats.UniqueSequencesFound++; - } - }); - Stats.FilesChunked++; - return Chunked.Info.RawHash; - } - else - { - ZEN_TRACE_CPU("HashOnly"); - - IoBuffer Buffer = IoBufferBuilder::MakeFromFile((FolderPath / Path).make_preferred()); - if (Buffer.GetSize() != RawSize) - { - throw std::runtime_error(fmt::format("Failed opening file '{}' for hashing", FolderPath / Path)); - } - const IoHash Hash = IoHash::HashBuffer(Buffer, &Stats.BytesHashed); - - Lock.WithExclusiveLock([&]() { - if (!RawHashToSequenceRawHashIndex.contains(Hash)) - { - RawHashToSequenceRawHashIndex.insert( - {Hash, gsl::narrow<uint32_t>(OutChunkedContent.ChunkedContent.SequenceRawHashes.size())}); - AddChunkSequence(Stats, OutChunkedContent.ChunkedContent, ChunkHashToChunkIndex, Hash, RawSize); - Stats.UniqueSequencesFound++; - } - }); - return Hash; - } - } - } - - std::string PathCompareString(const std::filesystem::path& Path) { return ToLower(Path.generic_string()); } - -} // namespace - -std::string_view FolderContentSourcePlatformNames[(size_t)SourcePlatform::_Count] = {"Windows"sv, "Linux"sv, "MacOS"sv}; - -std::string_view -ToString(SourcePlatform Platform) -{ - return FolderContentSourcePlatformNames[(size_t)Platform]; -} - -SourcePlatform -FromString(std::string_view Platform, SourcePlatform Default) -{ - for (size_t Index = 0; Index < (size_t)SourcePlatform::_Count; Index++) - { - if (Platform == FolderContentSourcePlatformNames[Index]) - { - return (SourcePlatform)Index; - } - } - return Default; -} - -SourcePlatform -GetSourceCurrentPlatform() -{ -#if ZEN_PLATFORM_WINDOWS - return SourcePlatform::Windows; -#endif -#if ZEN_PLATFORM_MAC - return SourcePlatform::MacOS; -#endif -#if ZEN_PLATFORM_LINUX - return SourcePlatform::Linux; -#endif -} - -bool -FolderContent::AreFileAttributesEqual(const uint32_t Lhs, const uint32_t Rhs) -{ -#if ZEN_PLATFORM_WINDOWS - return (Lhs & 0xff) == (Rhs & 0xff); -#endif -#if ZEN_PLATFORM_MAC - return Lhs == Rhs; -#endif -#if ZEN_PLATFORM_LINUX - return Lhs == Rhs; -#endif -} - -bool -FolderContent::operator==(const FolderContent& Rhs) const -{ - if ((Platform == Rhs.Platform) && (RawSizes == Rhs.RawSizes) && (Attributes == Rhs.Attributes) && - (ModificationTicks == Rhs.ModificationTicks) && (Paths.size() == Rhs.Paths.size())) - { - size_t PathCount = 0; - for (size_t PathIndex = 0; PathIndex < PathCount; PathIndex++) - { - if (Paths[PathIndex].generic_string() != Rhs.Paths[PathIndex].generic_string()) - { - return false; - } - } - return true; - } - return false; -} - -bool -FolderContent::AreKnownFilesEqual(const FolderContent& Rhs) const -{ - ZEN_TRACE_CPU("FolderContent::AreKnownFilesEqual"); - tsl::robin_map<std::string, size_t> RhsPathToIndex; - const size_t RhsPathCount = Rhs.Paths.size(); - RhsPathToIndex.reserve(RhsPathCount); - for (size_t RhsPathIndex = 0; RhsPathIndex < RhsPathCount; RhsPathIndex++) - { - RhsPathToIndex.insert({Rhs.Paths[RhsPathIndex].generic_string(), RhsPathIndex}); - } - const size_t PathCount = Paths.size(); - for (size_t PathIndex = 0; PathIndex < PathCount; PathIndex++) - { - if (auto It = RhsPathToIndex.find(Paths[PathIndex].generic_string()); It != RhsPathToIndex.end()) - { - const size_t RhsPathIndex = It->second; - if ((RawSizes[PathIndex] != Rhs.RawSizes[RhsPathIndex]) || - (!AreFileAttributesEqual(Attributes[PathIndex], Rhs.Attributes[RhsPathIndex])) || - (ModificationTicks[PathIndex] != Rhs.ModificationTicks[RhsPathIndex])) - { - return false; - } - } - else - { - return false; - } - } - return true; -} - -void -FolderContent::UpdateState(const FolderContent& Rhs, std::vector<uint32_t>& OutPathIndexesOufOfDate) -{ - ZEN_TRACE_CPU("FolderContent::UpdateState"); - tsl::robin_map<std::string, uint32_t> RhsPathToIndex; - const uint32_t RhsPathCount = gsl::narrow<uint32_t>(Rhs.Paths.size()); - RhsPathToIndex.reserve(RhsPathCount); - for (uint32_t RhsPathIndex = 0; RhsPathIndex < RhsPathCount; RhsPathIndex++) - { - RhsPathToIndex.insert({Rhs.Paths[RhsPathIndex].generic_string(), RhsPathIndex}); - } - uint32_t PathCount = gsl::narrow<uint32_t>(Paths.size()); - for (uint32_t PathIndex = 0; PathIndex < PathCount;) - { - if (auto It = RhsPathToIndex.find(Paths[PathIndex].generic_string()); It != RhsPathToIndex.end()) - { - const uint32_t RhsPathIndex = It->second; - - if ((RawSizes[PathIndex] != Rhs.RawSizes[RhsPathIndex]) || - (ModificationTicks[PathIndex] != Rhs.ModificationTicks[RhsPathIndex])) - { - RawSizes[PathIndex] = Rhs.RawSizes[RhsPathIndex]; - ModificationTicks[PathIndex] = Rhs.ModificationTicks[RhsPathIndex]; - OutPathIndexesOufOfDate.push_back(PathIndex); - } - Attributes[PathIndex] = Rhs.Attributes[RhsPathIndex]; - PathIndex++; - } - else - { - Paths.erase(Paths.begin() + PathIndex); - RawSizes.erase(RawSizes.begin() + PathIndex); - Attributes.erase(Attributes.begin() + PathIndex); - ModificationTicks.erase(ModificationTicks.begin() + PathIndex); - PathCount--; - } - } -} - -FolderContent -GetUpdatedContent(const FolderContent& Old, const FolderContent& New, std::vector<std::filesystem::path>& OutDeletedPaths) -{ - ZEN_TRACE_CPU("FolderContent::GetUpdatedContent"); - - const uint32_t NewPathCount = gsl::narrow<uint32_t>(New.Paths.size()); - - FolderContent Result = {.Platform = Old.Platform}; - Result.Paths.reserve(NewPathCount); - Result.RawSizes.reserve(NewPathCount); - Result.Attributes.reserve(NewPathCount); - Result.ModificationTicks.reserve(NewPathCount); - - tsl::robin_map<std::string, uint32_t> NewPathToIndex; - NewPathToIndex.reserve(NewPathCount); - for (uint32_t NewPathIndex = 0; NewPathIndex < NewPathCount; NewPathIndex++) - { - NewPathToIndex.insert({New.Paths[NewPathIndex].generic_string(), NewPathIndex}); - } - - uint32_t OldPathCount = gsl::narrow<uint32_t>(Old.Paths.size()); - for (uint32_t OldPathIndex = 0; OldPathIndex < OldPathCount; OldPathIndex++) - { - if (auto It = NewPathToIndex.find(Old.Paths[OldPathIndex].generic_string()); It != NewPathToIndex.end()) - { - const uint32_t NewPathIndex = It->second; - - if ((Old.RawSizes[OldPathIndex] != New.RawSizes[NewPathIndex]) || - (Old.ModificationTicks[OldPathIndex] != New.ModificationTicks[NewPathIndex])) - { - Result.Paths.push_back(New.Paths[NewPathIndex]); - Result.RawSizes.push_back(New.RawSizes[NewPathIndex]); - Result.Attributes.push_back(New.Attributes[NewPathIndex]); - Result.ModificationTicks.push_back(New.ModificationTicks[NewPathIndex]); - } - } - else - { - OutDeletedPaths.push_back(Old.Paths[OldPathIndex]); - } - } - return Result; -} - -void -SaveFolderContentToCompactBinary(const FolderContent& Content, CbWriter& Output) -{ - ZEN_TRACE_CPU("SaveFolderContentToCompactBinary"); - Output.AddString("platform"sv, ToString(Content.Platform)); - compactbinary_helpers::WriteArray(Content.Paths, "paths"sv, Output); - compactbinary_helpers::WriteArray(Content.RawSizes, "rawSizes"sv, Output); - compactbinary_helpers::WriteArray(Content.Attributes, "attributes"sv, Output); - compactbinary_helpers::WriteArray(Content.ModificationTicks, "modificationTimes"sv, Output); -} - -FolderContent -LoadFolderContentToCompactBinary(CbObjectView Input) -{ - ZEN_TRACE_CPU("LoadFolderContentToCompactBinary"); - FolderContent Content; - Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); - compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); - compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); - compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); - compactbinary_helpers::ReadArray("modificationTimes"sv, Input, Content.ModificationTicks); - return Content; -} - -FolderContent -GetFolderContent(GetFolderContentStatistics& Stats, - const std::filesystem::path& RootPath, - std::function<bool(const std::string_view& RelativePath)>&& AcceptDirectory, - std::function<bool(std::string_view RelativePath, uint64_t Size, uint32_t Attributes)>&& AcceptFile, - WorkerThreadPool& WorkerPool, - int32_t UpdateIntervalMS, - std::function<void(bool IsAborted, std::ptrdiff_t PendingWork)>&& UpdateCallback, - std::atomic<bool>& AbortFlag) -{ - ZEN_TRACE_CPU("GetFolderContent"); - - Stopwatch Timer; - auto _ = MakeGuard([&Stats, &Timer]() { Stats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); }); - - FolderContent Content; - struct AsyncVisitor : public GetDirectoryContentVisitor - { - AsyncVisitor(GetFolderContentStatistics& Stats, - std::atomic<bool>& AbortFlag, - FolderContent& Content, - std::function<bool(const std::string_view& RelativePath)>&& AcceptDirectory, - std::function<bool(std::string_view RelativePath, uint64_t Size, uint32_t Attributes)>&& AcceptFile) - : m_Stats(Stats) - , m_AbortFlag(AbortFlag) - , m_FoundContent(Content) - , m_AcceptDirectory(std::move(AcceptDirectory)) - , m_AcceptFile(std::move(AcceptFile)) - { - } - virtual void AsyncVisitDirectory(const std::filesystem::path& RelativeRoot, DirectoryContent&& Content) override - { - if (!m_AbortFlag) - { - m_Stats.FoundFileCount += Content.FileNames.size(); - for (uint64_t FileSize : Content.FileSizes) - { - m_Stats.FoundFileByteCount += FileSize; - } - std::string RelativeDirectoryPath = RelativeRoot.generic_string(); - if (m_AcceptDirectory(RelativeDirectoryPath)) - { - std::vector<std::filesystem::path> Paths; - std::vector<uint64_t> RawSizes; - std::vector<uint32_t> Attributes; - std::vector<uint64_t> ModificatonTicks; - Paths.reserve(Content.FileNames.size()); - RawSizes.reserve(Content.FileNames.size()); - Attributes.reserve(Content.FileNames.size()); - ModificatonTicks.reserve(Content.FileModificationTicks.size()); - - for (size_t FileIndex = 0; FileIndex < Content.FileNames.size(); FileIndex++) - { - const std::filesystem::path& FileName = Content.FileNames[FileIndex]; - std::string RelativePath = (RelativeRoot / FileName).generic_string(); - std::replace(RelativePath.begin(), RelativePath.end(), '\\', '/'); - if (m_AcceptFile(RelativePath, Content.FileSizes[FileIndex], Content.FileAttributes[FileIndex])) - { - Paths.emplace_back(std::move(RelativePath)); - RawSizes.emplace_back(Content.FileSizes[FileIndex]); - Attributes.emplace_back(Content.FileAttributes[FileIndex]); - ModificatonTicks.emplace_back(Content.FileModificationTicks[FileIndex]); - - m_Stats.AcceptedFileCount++; - m_Stats.AcceptedFileByteCount += Content.FileSizes[FileIndex]; - } - } - m_Lock.WithExclusiveLock([&]() { - m_FoundContent.Paths.insert(m_FoundContent.Paths.end(), Paths.begin(), Paths.end()); - m_FoundContent.RawSizes.insert(m_FoundContent.RawSizes.end(), RawSizes.begin(), RawSizes.end()); - m_FoundContent.Attributes.insert(m_FoundContent.Attributes.end(), Attributes.begin(), Attributes.end()); - m_FoundContent.ModificationTicks.insert(m_FoundContent.ModificationTicks.end(), - ModificatonTicks.begin(), - ModificatonTicks.end()); - }); - } - } - } - - GetFolderContentStatistics& m_Stats; - std::atomic<bool>& m_AbortFlag; - RwLock m_Lock; - FolderContent& m_FoundContent; - std::function<bool(const std::string_view& RelativePath)> m_AcceptDirectory; - std::function<bool(std::string_view RelativePath, uint64_t Size, uint32_t Attributes)> m_AcceptFile; - } Visitor(Stats, AbortFlag, Content, std::move(AcceptDirectory), std::move(AcceptFile)); - - Latch PendingWork(1); - GetDirectoryContent(RootPath, - DirectoryContentFlags::IncludeFiles | DirectoryContentFlags::Recursive | DirectoryContentFlags::IncludeFileSizes | - DirectoryContentFlags::IncludeAttributes | DirectoryContentFlags::IncludeModificationTick, - Visitor, - WorkerPool, - PendingWork); - PendingWork.CountDown(); - while (!PendingWork.Wait(UpdateIntervalMS)) - { - UpdateCallback(AbortFlag.load(), PendingWork.Remaining()); - } - std::vector<size_t> Order; - size_t PathCount = Content.Paths.size(); - Order.resize(Content.Paths.size()); - std::vector<std::string> Parents; - Parents.reserve(PathCount); - std::vector<std::string> Filenames; - Filenames.reserve(PathCount); - for (size_t OrderIndex = 0; OrderIndex < PathCount; OrderIndex++) - { - Order[OrderIndex] = OrderIndex; - Parents.emplace_back(Content.Paths[OrderIndex].parent_path().generic_string()); - Filenames.emplace_back(Content.Paths[OrderIndex].filename().generic_string()); - } - std::sort(Order.begin(), Order.end(), [&Parents, &Filenames](size_t Lhs, size_t Rhs) { - const std::string& LhsParent = Parents[Lhs]; - const std::string& RhsParent = Parents[Rhs]; - if (LhsParent < RhsParent) - { - return true; - } - else if (LhsParent > RhsParent) - { - return false; - } - return Filenames[Lhs] < Filenames[Rhs]; - }); - FolderContent OrderedContent; - OrderedContent.Paths.reserve(PathCount); - OrderedContent.RawSizes.reserve(PathCount); - OrderedContent.Attributes.reserve(PathCount); - OrderedContent.ModificationTicks.reserve(PathCount); - for (size_t OrderIndex : Order) - { - OrderedContent.Paths.emplace_back(std::move(Content.Paths[OrderIndex])); - OrderedContent.RawSizes.emplace_back(Content.RawSizes[OrderIndex]); - OrderedContent.Attributes.emplace_back(Content.Attributes[OrderIndex]); - OrderedContent.ModificationTicks.emplace_back(Content.ModificationTicks[OrderIndex]); - } - return OrderedContent; -} - -void -SaveChunkedFolderContentToCompactBinary(const ChunkedFolderContent& Content, CbWriter& Output) -{ - ZEN_TRACE_CPU("SaveChunkedFolderContentToCompactBinary"); - Output.AddString("platform"sv, ToString(Content.Platform)); - compactbinary_helpers::WriteArray(Content.Paths, "paths"sv, Output); - compactbinary_helpers::WriteArray(Content.RawSizes, "rawSizes"sv, Output); - compactbinary_helpers::WriteArray(Content.Attributes, "attributes"sv, Output); - compactbinary_helpers::WriteArray(Content.RawHashes, "rawHashes"sv, Output); - - Output.BeginObject("chunkedContent"); - compactbinary_helpers::WriteArray(Content.ChunkedContent.SequenceRawHashes, "sequenceRawHashes"sv, Output); - compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkCounts, "chunkCounts"sv, Output); - compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkOrders, "chunkOrders"sv, Output); - compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkHashes, "chunkHashes"sv, Output); - compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkRawSizes, "chunkRawSizes"sv, Output); - Output.EndObject(); // chunkedContent -} - -ChunkedFolderContent -LoadChunkedFolderContentToCompactBinary(CbObjectView Input) -{ - ZEN_TRACE_CPU("LoadChunkedFolderContentToCompactBinary"); - ChunkedFolderContent Content; - Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); - compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); - compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); - compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); - compactbinary_helpers::ReadArray("rawHashes"sv, Input, Content.RawHashes); - - CbObjectView ChunkedContentView = Input["chunkedContent"sv].AsObjectView(); - compactbinary_helpers::ReadArray("sequenceRawHashes"sv, ChunkedContentView, Content.ChunkedContent.SequenceRawHashes); - compactbinary_helpers::ReadArray("chunkCounts"sv, ChunkedContentView, Content.ChunkedContent.ChunkCounts); - compactbinary_helpers::ReadArray("chunkOrders"sv, ChunkedContentView, Content.ChunkedContent.ChunkOrders); - compactbinary_helpers::ReadArray("chunkHashes"sv, ChunkedContentView, Content.ChunkedContent.ChunkHashes); - compactbinary_helpers::ReadArray("chunkRawSizes"sv, ChunkedContentView, Content.ChunkedContent.ChunkRawSizes); - return Content; -} - -ChunkedFolderContent -MergeChunkedFolderContents(const ChunkedFolderContent& Base, std::span<const ChunkedFolderContent> Overlays) -{ - ZEN_TRACE_CPU("MergeChunkedFolderContents"); - - ZEN_ASSERT(!Overlays.empty()); - - ChunkedFolderContent Result; - const size_t BasePathCount = Base.Paths.size(); - Result.Paths.reserve(BasePathCount); - Result.RawSizes.reserve(BasePathCount); - Result.Attributes.reserve(BasePathCount); - Result.RawHashes.reserve(BasePathCount); - - const size_t BaseChunkCount = Base.ChunkedContent.ChunkHashes.size(); - Result.ChunkedContent.SequenceRawHashes.reserve(Base.ChunkedContent.SequenceRawHashes.size()); - Result.ChunkedContent.ChunkCounts.reserve(BaseChunkCount); - Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); - Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); - Result.ChunkedContent.ChunkOrders.reserve(Base.ChunkedContent.ChunkOrders.size()); - - tsl::robin_map<std::string, std::filesystem::path> GenericPathToActualPath; - for (const std::filesystem::path& Path : Base.Paths) - { - GenericPathToActualPath.insert({PathCompareString(Path), Path}); - } - for (const ChunkedFolderContent& Overlay : Overlays) - { - for (const std::filesystem::path& Path : Overlay.Paths) - { - GenericPathToActualPath.insert({PathCompareString(Path), Path}); - } - } - - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceRawHashIndex; - - auto BuildOverlayPaths = [](std::span<const ChunkedFolderContent> Overlays) -> tsl::robin_set<std::string> { - tsl::robin_set<std::string> Result; - for (const ChunkedFolderContent& OverlayContent : Overlays) - { - for (const std::filesystem::path& Path : OverlayContent.Paths) - { - Result.insert(PathCompareString(Path)); - } - } - return Result; - }; - - auto AddContent = [&BuildOverlayPaths](ChunkedFolderContent& Result, - const ChunkedFolderContent& OverlayContent, - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& RawHashToSequenceRawHashIndex, - const tsl::robin_map<std::string, std::filesystem::path>& GenericPathToActualPath, - std::span<const ChunkedFolderContent> Overlays) { - const ChunkedContentLookup OverlayLookup = BuildChunkedContentLookup(OverlayContent); - tsl::robin_set<std::string> BaseOverlayPaths = BuildOverlayPaths(Overlays); - for (uint32_t PathIndex = 0; PathIndex < OverlayContent.Paths.size(); PathIndex++) - { - std::string GenericPath = PathCompareString(OverlayContent.Paths[PathIndex]); - if (!BaseOverlayPaths.contains(GenericPath)) - { - // This asset will not be overridden by a later layer - add it - - const std::filesystem::path OriginalPath = GenericPathToActualPath.at(GenericPath); - Result.Paths.push_back(OriginalPath); - const IoHash& RawHash = OverlayContent.RawHashes[PathIndex]; - Result.RawSizes.push_back(OverlayContent.RawSizes[PathIndex]); - Result.Attributes.push_back(OverlayContent.Attributes[PathIndex]); - Result.RawHashes.push_back(RawHash); - - if (OverlayContent.RawSizes[PathIndex] > 0) - { - if (!RawHashToSequenceRawHashIndex.contains(RawHash)) - { - RawHashToSequenceRawHashIndex.insert( - {RawHash, gsl::narrow<uint32_t>(Result.ChunkedContent.SequenceRawHashes.size())}); - const uint32_t SequenceRawHashIndex = OverlayLookup.RawHashToSequenceIndex.at(RawHash); - const uint32_t OrderIndexOffset = OverlayLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; - const uint32_t ChunkCount = OverlayContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; - ChunkingStatistics Stats; - std::span<const uint32_t> OriginalChunkOrder = - std::span<const uint32_t>(OverlayContent.ChunkedContent.ChunkOrders).subspan(OrderIndexOffset, ChunkCount); - AddChunkSequence(Stats, - Result.ChunkedContent, - ChunkHashToChunkIndex, - RawHash, - OriginalChunkOrder, - OverlayContent.ChunkedContent.ChunkHashes, - OverlayContent.ChunkedContent.ChunkRawSizes); - Stats.UniqueSequencesFound++; - } - } - } - } - }; - - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> MergedChunkHashToChunkIndex; - AddContent(Result, Base, MergedChunkHashToChunkIndex, RawHashToSequenceRawHashIndex, GenericPathToActualPath, Overlays); - for (uint32_t OverlayIndex = 0; OverlayIndex < Overlays.size(); OverlayIndex++) - { - AddContent(Result, - Overlays[OverlayIndex], - MergedChunkHashToChunkIndex, - RawHashToSequenceRawHashIndex, - GenericPathToActualPath, - Overlays.subspan(OverlayIndex + 1)); - } - return Result; -} - -ChunkedFolderContent -DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, - const ChunkedContentLookup& BaseContentLookup, - std::span<const std::filesystem::path> DeletedPaths) -{ - ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); - - ZEN_ASSERT(DeletedPaths.size() <= BaseContent.Paths.size()); - ChunkedFolderContent Result = {.Platform = BaseContent.Platform}; - if (DeletedPaths.size() < BaseContent.Paths.size()) - { - tsl::robin_set<std::string> DeletedPathSet; - DeletedPathSet.reserve(DeletedPaths.size()); - for (const std::filesystem::path& DeletedPath : DeletedPaths) - { - DeletedPathSet.insert(PathCompareString(DeletedPath)); - } - - const size_t BaseChunkCount = BaseContent.ChunkedContent.ChunkHashes.size(); - std::vector<uint32_t> NewChunkIndexes(BaseChunkCount, (uint32_t)-1); - - const size_t ExpectedPathCount = BaseContent.Paths.size() - DeletedPaths.size(); - Result.Paths.reserve(ExpectedPathCount); - Result.RawSizes.reserve(ExpectedPathCount); - Result.Attributes.reserve(ExpectedPathCount); - Result.RawHashes.reserve(ExpectedPathCount); - - Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); - Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); - - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceRawHashIndex; - for (uint32_t PathIndex = 0; PathIndex < BaseContent.Paths.size(); PathIndex++) - { - const std::filesystem::path& Path = BaseContent.Paths[PathIndex]; - if (!DeletedPathSet.contains(PathCompareString(Path))) - { - const IoHash& RawHash = BaseContent.RawHashes[PathIndex]; - const uint64_t RawSize = BaseContent.RawSizes[PathIndex]; - Result.Paths.push_back(Path); - Result.RawSizes.push_back(RawSize); - Result.Attributes.push_back(BaseContent.Attributes[PathIndex]); - Result.RawHashes.push_back(RawHash); - if (RawSize > 0) - { - if (!RawHashToSequenceRawHashIndex.contains(RawHash)) - { - RawHashToSequenceRawHashIndex.insert( - {RawHash, gsl::narrow<uint32_t>(Result.ChunkedContent.SequenceRawHashes.size())}); - const uint32_t SequenceRawHashIndex = BaseContentLookup.RawHashToSequenceIndex.at(RawHash); - const uint32_t OrderIndexOffset = BaseContentLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; - const uint32_t ChunkCount = BaseContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; - - std::span<const uint32_t> OriginalChunkOrder = - std::span<const uint32_t>(BaseContent.ChunkedContent.ChunkOrders).subspan(OrderIndexOffset, ChunkCount); - - Result.ChunkedContent.ChunkCounts.push_back(gsl::narrow<uint32_t>(OriginalChunkOrder.size())); - - for (uint32_t OldChunkIndex : OriginalChunkOrder) - { - if (uint32_t FoundChunkIndex = NewChunkIndexes[OldChunkIndex]; FoundChunkIndex != (uint32_t)-1) - { - Result.ChunkedContent.ChunkOrders.push_back(FoundChunkIndex); - } - else - { - const uint32_t NewChunkIndex = gsl::narrow<uint32_t>(Result.ChunkedContent.ChunkHashes.size()); - NewChunkIndexes[OldChunkIndex] = NewChunkIndex; - const IoHash& ChunkHash = BaseContent.ChunkedContent.ChunkHashes[OldChunkIndex]; - const uint64_t OldChunkSize = BaseContent.ChunkedContent.ChunkRawSizes[OldChunkIndex]; - Result.ChunkedContent.ChunkHashes.push_back(ChunkHash); - Result.ChunkedContent.ChunkRawSizes.push_back(OldChunkSize); - Result.ChunkedContent.ChunkOrders.push_back(NewChunkIndex); - } - } - Result.ChunkedContent.SequenceRawHashes.push_back(RawHash); - } - } - } - } - } - return Result; -} - -ChunkedFolderContent -DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span<const std::filesystem::path> DeletedPaths) -{ - ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); - ZEN_ASSERT(DeletedPaths.size() <= BaseContent.Paths.size()); - if (DeletedPaths.size() == BaseContent.Paths.size()) - { - return {}; - } - const ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent); - return DeletePathsFromChunkedContent(BaseContent, BaseLookup, DeletedPaths); -} - -ChunkedFolderContent -ChunkFolderContent(ChunkingStatistics& Stats, - WorkerThreadPool& WorkerPool, - const std::filesystem::path& RootPath, - const FolderContent& Content, - const ChunkingController& InChunkingController, - int32_t UpdateIntervalMS, - std::function<void(bool IsAborted, bool IsPaused, std::ptrdiff_t PendingWork)>&& UpdateCallback, - std::atomic<bool>& AbortFlag, - std::atomic<bool>& PauseFlag) -{ - ZEN_TRACE_CPU("ChunkFolderContent"); - - Stopwatch Timer; - auto _ = MakeGuard([&Stats, &Timer]() { Stats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); }); - - ChunkedFolderContent Result = {.Platform = Content.Platform, - .Paths = Content.Paths, - .RawSizes = Content.RawSizes, - .Attributes = Content.Attributes}; - const size_t ItemCount = Result.Paths.size(); - Result.RawHashes.resize(ItemCount, IoHash::Zero); - Result.ChunkedContent.SequenceRawHashes.reserve(ItemCount); // Up to 1 per file, maybe less - Result.ChunkedContent.ChunkCounts.reserve(ItemCount); // Up to one per file - Result.ChunkedContent.ChunkOrders.reserve(ItemCount); // At least 1 per file, maybe more - Result.ChunkedContent.ChunkHashes.reserve(ItemCount); // At least 1 per file, maybe more - Result.ChunkedContent.ChunkRawSizes.reserve(ItemCount); // At least 1 per file, maybe more - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToChunkSequenceIndex; - RawHashToChunkSequenceIndex.reserve(ItemCount); - ChunkHashToChunkIndex.reserve(ItemCount); - { - std::vector<uint32_t> Order; - Order.resize(ItemCount); - for (uint32_t I = 0; I < ItemCount; I++) - { - Order[I] = I; - } - - // Handle the biggest files first so we don't end up with one straggling large file at the end - // std::sort(Order.begin(), Order.end(), [&](uint32_t Lhs, uint32_t Rhs) { return Result.RawSizes[Lhs] > Result.RawSizes[Rhs]; - //}); - - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceRawHashIndex; - RawHashToSequenceRawHashIndex.reserve(ItemCount); - - RwLock Lock; - - ParallelWork Work(AbortFlag, PauseFlag, WorkerThreadPool::EMode::EnableBacklog); - - for (uint32_t PathIndex : Order) - { - if (Work.IsAborted()) - { - break; - } - Work.ScheduleWork(WorkerPool, // GetSyncWorkerPool() - [&, PathIndex](std::atomic<bool>& AbortFlag) { - if (!AbortFlag) - { - IoHash RawHash = HashOneFile(Stats, - InChunkingController, - Result, - ChunkHashToChunkIndex, - RawHashToSequenceRawHashIndex, - Lock, - RootPath, - PathIndex, - AbortFlag); - Lock.WithExclusiveLock([&]() { Result.RawHashes[PathIndex] = RawHash; }); - Stats.FilesProcessed++; - } - }); - } - - Work.Wait(UpdateIntervalMS, [&](bool IsAborted, bool IsPaused, std::ptrdiff_t PendingWork) { - ZEN_UNUSED(PendingWork); - UpdateCallback(IsAborted, IsPaused, Work.PendingWork().Remaining()); - }); - } - return Result; -} - -ChunkedContentLookup -BuildChunkedContentLookup(const ChunkedFolderContent& Content) -{ - ZEN_TRACE_CPU("BuildChunkedContentLookup"); - - struct ChunkLocationReference - { - uint32_t ChunkIndex = (uint32_t)-1; - uint32_t SequenceIndex = (uint32_t)-1; - uint64_t Offset = (uint64_t)-1; - }; - - ChunkedContentLookup Result; - { - const uint32_t SequenceRawHashesCount = gsl::narrow<uint32_t>(Content.ChunkedContent.SequenceRawHashes.size()); - Result.RawHashToSequenceIndex.reserve(SequenceRawHashesCount); - Result.SequenceIndexChunkOrderOffset.reserve(SequenceRawHashesCount); - uint32_t OrderOffset = 0; - for (uint32_t SequenceRawHashIndex = 0; SequenceRawHashIndex < Content.ChunkedContent.SequenceRawHashes.size(); - SequenceRawHashIndex++) - { - Result.RawHashToSequenceIndex.insert({Content.ChunkedContent.SequenceRawHashes[SequenceRawHashIndex], SequenceRawHashIndex}); - Result.SequenceIndexChunkOrderOffset.push_back(OrderOffset); - OrderOffset += Content.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; - } - } - - std::vector<ChunkLocationReference> Locations; - Locations.reserve(Content.ChunkedContent.ChunkOrders.size()); - for (uint32_t SequenceIndex = 0; SequenceIndex < Content.ChunkedContent.SequenceRawHashes.size(); SequenceIndex++) - { - const uint32_t OrderOffset = Result.SequenceIndexChunkOrderOffset[SequenceIndex]; - const uint32_t ChunkCount = Content.ChunkedContent.ChunkCounts[SequenceIndex]; - uint64_t LocationOffset = 0; - for (size_t OrderIndex = OrderOffset; OrderIndex < OrderOffset + ChunkCount; OrderIndex++) - { - uint32_t ChunkIndex = Content.ChunkedContent.ChunkOrders[OrderIndex]; - - Locations.push_back(ChunkLocationReference{.ChunkIndex = ChunkIndex, .SequenceIndex = SequenceIndex, .Offset = LocationOffset}); - - LocationOffset += Content.ChunkedContent.ChunkRawSizes[ChunkIndex]; - } - } - - std::sort(Locations.begin(), Locations.end(), [](const ChunkLocationReference& Lhs, const ChunkLocationReference& Rhs) { - if (Lhs.ChunkIndex < Rhs.ChunkIndex) - { - return true; - } - if (Lhs.ChunkIndex > Rhs.ChunkIndex) - { - return false; - } - if (Lhs.SequenceIndex < Rhs.SequenceIndex) - { - return true; - } - if (Lhs.SequenceIndex > Rhs.SequenceIndex) - { - return false; - } - return Lhs.Offset < Rhs.Offset; - }); - - Result.ChunkSequenceLocations.reserve(Locations.size()); - const uint32_t ChunkCount = gsl::narrow<uint32_t>(Content.ChunkedContent.ChunkHashes.size()); - Result.ChunkHashToChunkIndex.reserve(ChunkCount); - size_t RangeOffset = 0; - for (uint32_t ChunkIndex = 0; ChunkIndex < ChunkCount; ChunkIndex++) - { - Result.ChunkHashToChunkIndex.insert({Content.ChunkedContent.ChunkHashes[ChunkIndex], ChunkIndex}); - uint32_t Count = 0; - while ((RangeOffset + Count < Locations.size()) && (Locations[RangeOffset + Count].ChunkIndex == ChunkIndex)) - { - const ChunkLocationReference& LocationReference = Locations[RangeOffset + Count]; - Result.ChunkSequenceLocations.push_back( - ChunkedContentLookup::ChunkSequenceLocation{.SequenceIndex = LocationReference.SequenceIndex, - .Offset = LocationReference.Offset}); - Count++; - } - Result.ChunkSequenceLocationOffset.push_back(RangeOffset); - Result.ChunkSequenceLocationCounts.push_back(Count); - RangeOffset += Count; - } - - Result.SequenceIndexFirstPathIndex.resize(Content.ChunkedContent.SequenceRawHashes.size(), (uint32_t)-1); - Result.PathExtensionHash.resize(Content.Paths.size()); - for (uint32_t PathIndex = 0; PathIndex < Content.Paths.size(); PathIndex++) - { - std::string LowercaseExtension = Content.Paths[PathIndex].extension().string(); - std::transform(LowercaseExtension.begin(), LowercaseExtension.end(), LowercaseExtension.begin(), [](char c) { - return (char)::tolower(c); - }); - Result.PathExtensionHash[PathIndex] = HashStringDjb2(LowercaseExtension); - if (Content.RawSizes[PathIndex] > 0) - { - const IoHash& RawHash = Content.RawHashes[PathIndex]; - auto SequenceIndexIt = Result.RawHashToSequenceIndex.find(RawHash); - ZEN_ASSERT(SequenceIndexIt != Result.RawHashToSequenceIndex.end()); - const uint32_t SequenceIndex = SequenceIndexIt->second; - if (Result.SequenceIndexFirstPathIndex[SequenceIndex] == (uint32_t)-1) - { - Result.SequenceIndexFirstPathIndex[SequenceIndex] = PathIndex; - } - } - } - - return Result; -} - -} // namespace zen |