// Copyright Epic Games, Inc. All Rights Reserved. #include #include #include #include #include #include #include #include #include #include #include ZEN_THIRD_PARTY_INCLUDES_START #include #include ZEN_THIRD_PARTY_INCLUDES_END namespace zen { using namespace std::literals; namespace { void AddChunkSequence(ChunkingStatistics& Stats, ChunkedContentData& InOutChunkedContent, tsl::robin_map& ChunkHashToChunkIndex, const IoHash& RawHash, std::span ChunkSequence, std::span ChunkHashes, std::span ChunkRawSizes) { ZEN_ASSERT(ChunkHashes.size() == ChunkRawSizes.size()); InOutChunkedContent.ChunkCounts.push_back(gsl::narrow(ChunkSequence.size())); InOutChunkedContent.ChunkOrders.reserve(InOutChunkedContent.ChunkOrders.size() + ChunkSequence.size()); for (uint32_t ChunkedSequenceIndex : ChunkSequence) { const IoHash& ChunkHash = ChunkHashes[ChunkedSequenceIndex]; if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) { uint32_t ChunkIndex = gsl::narrow(It->second); InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); } else { uint32_t ChunkIndex = gsl::narrow(InOutChunkedContent.ChunkHashes.size()); ChunkHashToChunkIndex.insert_or_assign(ChunkHash, ChunkIndex); InOutChunkedContent.ChunkHashes.push_back(ChunkHash); InOutChunkedContent.ChunkRawSizes.push_back(ChunkRawSizes[ChunkedSequenceIndex]); InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); Stats.UniqueChunksFound++; Stats.UniqueBytesFound += ChunkRawSizes[ChunkedSequenceIndex]; } } InOutChunkedContent.SequenceRawHashes.push_back(RawHash); Stats.UniqueSequencesFound++; } void AddChunkSequence(ChunkingStatistics& Stats, ChunkedContentData& InOutChunkedContent, tsl::robin_map& ChunkHashToChunkIndex, const IoHash& RawHash, const uint64_t RawSize) { InOutChunkedContent.ChunkCounts.push_back(1); if (auto It = ChunkHashToChunkIndex.find(RawHash); It != ChunkHashToChunkIndex.end()) { uint32_t ChunkIndex = gsl::narrow(It->second); InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); } else { uint32_t ChunkIndex = gsl::narrow(InOutChunkedContent.ChunkHashes.size()); ChunkHashToChunkIndex.insert_or_assign(RawHash, ChunkIndex); InOutChunkedContent.ChunkHashes.push_back(RawHash); InOutChunkedContent.ChunkRawSizes.push_back(RawSize); InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); Stats.UniqueChunksFound++; Stats.UniqueBytesFound += RawSize; } InOutChunkedContent.SequenceRawHashes.push_back(RawHash); Stats.UniqueSequencesFound++; } IoHash HashOneFile(ChunkingStatistics& Stats, const ChunkingController& InChunkingController, ChunkedFolderContent& OutChunkedContent, tsl::robin_map& ChunkHashToChunkIndex, tsl::robin_map& RawHashToSequenceRawHashIndex, RwLock& Lock, const std::filesystem::path& FolderPath, uint32_t PathIndex, std::atomic& AbortFlag) { ZEN_TRACE_CPU("ChunkFolderContent"); const uint64_t RawSize = OutChunkedContent.RawSizes[PathIndex]; const std::filesystem::path& Path = OutChunkedContent.Paths[PathIndex]; if (RawSize == 0) { return IoHash::Zero; } else { ChunkedInfoWithSource Chunked; const bool DidChunking = InChunkingController.ProcessFile((FolderPath / Path).make_preferred(), RawSize, Chunked, Stats.BytesHashed, AbortFlag); if (DidChunking) { Lock.WithExclusiveLock([&]() { if (!RawHashToSequenceRawHashIndex.contains(Chunked.Info.RawHash)) { RawHashToSequenceRawHashIndex.insert( {Chunked.Info.RawHash, gsl::narrow(OutChunkedContent.ChunkedContent.SequenceRawHashes.size())}); std::vector ChunkSizes; ChunkSizes.reserve(Chunked.ChunkSources.size()); for (const ChunkSource& Source : Chunked.ChunkSources) { ChunkSizes.push_back(Source.Size); } AddChunkSequence(Stats, OutChunkedContent.ChunkedContent, ChunkHashToChunkIndex, Chunked.Info.RawHash, Chunked.Info.ChunkSequence, Chunked.Info.ChunkHashes, ChunkSizes); Stats.UniqueSequencesFound++; } }); Stats.FilesChunked++; return Chunked.Info.RawHash; } else { ZEN_TRACE_CPU("HashOnly"); IoBuffer Buffer = IoBufferBuilder::MakeFromFile((FolderPath / Path).make_preferred()); if (Buffer.GetSize() != RawSize) { throw std::runtime_error(fmt::format("Failed opening file '{}' for hashing", FolderPath / Path)); } const IoHash Hash = IoHash::HashBuffer(Buffer, &Stats.BytesHashed); Lock.WithExclusiveLock([&]() { if (!RawHashToSequenceRawHashIndex.contains(Hash)) { RawHashToSequenceRawHashIndex.insert( {Hash, gsl::narrow(OutChunkedContent.ChunkedContent.SequenceRawHashes.size())}); AddChunkSequence(Stats, OutChunkedContent.ChunkedContent, ChunkHashToChunkIndex, Hash, RawSize); Stats.UniqueSequencesFound++; } }); return Hash; } } } std::string PathCompareString(const std::filesystem::path& Path) { return ToLower(Path.generic_string()); } } // namespace std::string_view FolderContentSourcePlatformNames[(size_t)SourcePlatform::_Count] = {"Windows"sv, "Linux"sv, "MacOS"sv}; std::string_view ToString(SourcePlatform Platform) { return FolderContentSourcePlatformNames[(size_t)Platform]; } SourcePlatform FromString(std::string_view Platform, SourcePlatform Default) { for (size_t Index = 0; Index < (size_t)SourcePlatform::_Count; Index++) { if (Platform == FolderContentSourcePlatformNames[Index]) { return (SourcePlatform)Index; } } return Default; } SourcePlatform GetSourceCurrentPlatform() { #if ZEN_PLATFORM_WINDOWS return SourcePlatform::Windows; #endif #if ZEN_PLATFORM_MAC return SourcePlatform::MacOS; #endif #if ZEN_PLATFORM_LINUX return SourcePlatform::Linux; #endif } bool FolderContent::AreFileAttributesEqual(const uint32_t Lhs, const uint32_t Rhs) { #if ZEN_PLATFORM_WINDOWS return (Lhs & 0xff) == (Rhs & 0xff); #endif #if ZEN_PLATFORM_MAC return Lhs == Rhs; #endif #if ZEN_PLATFORM_LINUX return Lhs == Rhs; #endif } bool FolderContent::operator==(const FolderContent& Rhs) const { if ((Platform == Rhs.Platform) && (RawSizes == Rhs.RawSizes) && (Attributes == Rhs.Attributes) && (ModificationTicks == Rhs.ModificationTicks) && (Paths.size() == Rhs.Paths.size())) { size_t PathCount = 0; for (size_t PathIndex = 0; PathIndex < PathCount; PathIndex++) { if (Paths[PathIndex].generic_string() != Rhs.Paths[PathIndex].generic_string()) { return false; } } return true; } return false; } bool FolderContent::AreKnownFilesEqual(const FolderContent& Rhs) const { ZEN_TRACE_CPU("FolderContent::AreKnownFilesEqual"); tsl::robin_map RhsPathToIndex; const size_t RhsPathCount = Rhs.Paths.size(); RhsPathToIndex.reserve(RhsPathCount); for (size_t RhsPathIndex = 0; RhsPathIndex < RhsPathCount; RhsPathIndex++) { RhsPathToIndex.insert({Rhs.Paths[RhsPathIndex].generic_string(), RhsPathIndex}); } const size_t PathCount = Paths.size(); for (size_t PathIndex = 0; PathIndex < PathCount; PathIndex++) { if (auto It = RhsPathToIndex.find(Paths[PathIndex].generic_string()); It != RhsPathToIndex.end()) { const size_t RhsPathIndex = It->second; if ((RawSizes[PathIndex] != Rhs.RawSizes[RhsPathIndex]) || (!AreFileAttributesEqual(Attributes[PathIndex], Rhs.Attributes[RhsPathIndex])) || (ModificationTicks[PathIndex] != Rhs.ModificationTicks[RhsPathIndex])) { return false; } } else { return false; } } return true; } void FolderContent::UpdateState(const FolderContent& Rhs, std::vector& OutPathIndexesOufOfDate) { ZEN_TRACE_CPU("FolderContent::UpdateState"); tsl::robin_map RhsPathToIndex; const uint32_t RhsPathCount = gsl::narrow(Rhs.Paths.size()); RhsPathToIndex.reserve(RhsPathCount); for (uint32_t RhsPathIndex = 0; RhsPathIndex < RhsPathCount; RhsPathIndex++) { RhsPathToIndex.insert({Rhs.Paths[RhsPathIndex].generic_string(), RhsPathIndex}); } uint32_t PathCount = gsl::narrow(Paths.size()); for (uint32_t PathIndex = 0; PathIndex < PathCount;) { if (auto It = RhsPathToIndex.find(Paths[PathIndex].generic_string()); It != RhsPathToIndex.end()) { const uint32_t RhsPathIndex = It->second; if ((RawSizes[PathIndex] != Rhs.RawSizes[RhsPathIndex]) || (ModificationTicks[PathIndex] != Rhs.ModificationTicks[RhsPathIndex])) { RawSizes[PathIndex] = Rhs.RawSizes[RhsPathIndex]; ModificationTicks[PathIndex] = Rhs.ModificationTicks[RhsPathIndex]; OutPathIndexesOufOfDate.push_back(PathIndex); } Attributes[PathIndex] = Rhs.Attributes[RhsPathIndex]; PathIndex++; } else { Paths.erase(Paths.begin() + PathIndex); RawSizes.erase(RawSizes.begin() + PathIndex); Attributes.erase(Attributes.begin() + PathIndex); ModificationTicks.erase(ModificationTicks.begin() + PathIndex); PathCount--; } } } FolderContent GetUpdatedContent(const FolderContent& Old, const FolderContent& New, std::vector& OutDeletedPaths) { ZEN_TRACE_CPU("FolderContent::GetUpdatedContent"); const uint32_t NewPathCount = gsl::narrow(New.Paths.size()); FolderContent Result = {.Platform = Old.Platform}; Result.Paths.reserve(NewPathCount); Result.RawSizes.reserve(NewPathCount); Result.Attributes.reserve(NewPathCount); Result.ModificationTicks.reserve(NewPathCount); tsl::robin_map NewPathToIndex; NewPathToIndex.reserve(NewPathCount); for (uint32_t NewPathIndex = 0; NewPathIndex < NewPathCount; NewPathIndex++) { NewPathToIndex.insert({New.Paths[NewPathIndex].generic_string(), NewPathIndex}); } uint32_t OldPathCount = gsl::narrow(Old.Paths.size()); for (uint32_t OldPathIndex = 0; OldPathIndex < OldPathCount; OldPathIndex++) { if (auto It = NewPathToIndex.find(Old.Paths[OldPathIndex].generic_string()); It != NewPathToIndex.end()) { const uint32_t NewPathIndex = It->second; if ((Old.RawSizes[OldPathIndex] != New.RawSizes[NewPathIndex]) || (Old.ModificationTicks[OldPathIndex] != New.ModificationTicks[NewPathIndex])) { Result.Paths.push_back(New.Paths[NewPathIndex]); Result.RawSizes.push_back(New.RawSizes[NewPathIndex]); Result.Attributes.push_back(New.Attributes[NewPathIndex]); Result.ModificationTicks.push_back(New.ModificationTicks[NewPathIndex]); } } else { OutDeletedPaths.push_back(Old.Paths[OldPathIndex]); } } return Result; } void SaveFolderContentToCompactBinary(const FolderContent& Content, CbWriter& Output) { ZEN_TRACE_CPU("SaveFolderContentToCompactBinary"); Output.AddString("platform"sv, ToString(Content.Platform)); compactbinary_helpers::WriteArray(Content.Paths, "paths"sv, Output); compactbinary_helpers::WriteArray(Content.RawSizes, "rawSizes"sv, Output); compactbinary_helpers::WriteArray(Content.Attributes, "attributes"sv, Output); compactbinary_helpers::WriteArray(Content.ModificationTicks, "modificationTimes"sv, Output); } FolderContent LoadFolderContentToCompactBinary(CbObjectView Input) { ZEN_TRACE_CPU("LoadFolderContentToCompactBinary"); FolderContent Content; Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); compactbinary_helpers::ReadArray("modificationTimes"sv, Input, Content.ModificationTicks); return Content; } FolderContent GetFolderContent(GetFolderContentStatistics& Stats, const std::filesystem::path& RootPath, std::function&& AcceptDirectory, std::function&& AcceptFile, WorkerThreadPool& WorkerPool, int32_t UpdateIntervalMS, std::function&& UpdateCallback, std::atomic& AbortFlag) { ZEN_TRACE_CPU("GetFolderContent"); Stopwatch Timer; auto _ = MakeGuard([&Stats, &Timer]() { Stats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); }); FolderContent Content; struct AsyncVisitor : public GetDirectoryContentVisitor { AsyncVisitor(GetFolderContentStatistics& Stats, std::atomic& AbortFlag, FolderContent& Content, std::function&& AcceptDirectory, std::function&& AcceptFile) : m_Stats(Stats) , m_AbortFlag(AbortFlag) , m_FoundContent(Content) , m_AcceptDirectory(std::move(AcceptDirectory)) , m_AcceptFile(std::move(AcceptFile)) { } virtual void AsyncVisitDirectory(const std::filesystem::path& RelativeRoot, DirectoryContent&& Content) override { if (!m_AbortFlag) { m_Stats.FoundFileCount += Content.FileNames.size(); for (uint64_t FileSize : Content.FileSizes) { m_Stats.FoundFileByteCount += FileSize; } std::string RelativeDirectoryPath = RelativeRoot.generic_string(); if (m_AcceptDirectory(RelativeDirectoryPath)) { std::vector Paths; std::vector RawSizes; std::vector Attributes; std::vector ModificatonTicks; Paths.reserve(Content.FileNames.size()); RawSizes.reserve(Content.FileNames.size()); Attributes.reserve(Content.FileNames.size()); ModificatonTicks.reserve(Content.FileModificationTicks.size()); for (size_t FileIndex = 0; FileIndex < Content.FileNames.size(); FileIndex++) { const std::filesystem::path& FileName = Content.FileNames[FileIndex]; std::string RelativePath = (RelativeRoot / FileName).generic_string(); std::replace(RelativePath.begin(), RelativePath.end(), '\\', '/'); if (m_AcceptFile(RelativePath, Content.FileSizes[FileIndex], Content.FileAttributes[FileIndex])) { Paths.emplace_back(std::move(RelativePath)); RawSizes.emplace_back(Content.FileSizes[FileIndex]); Attributes.emplace_back(Content.FileAttributes[FileIndex]); ModificatonTicks.emplace_back(Content.FileModificationTicks[FileIndex]); m_Stats.AcceptedFileCount++; m_Stats.AcceptedFileByteCount += Content.FileSizes[FileIndex]; } } m_Lock.WithExclusiveLock([&]() { m_FoundContent.Paths.insert(m_FoundContent.Paths.end(), Paths.begin(), Paths.end()); m_FoundContent.RawSizes.insert(m_FoundContent.RawSizes.end(), RawSizes.begin(), RawSizes.end()); m_FoundContent.Attributes.insert(m_FoundContent.Attributes.end(), Attributes.begin(), Attributes.end()); m_FoundContent.ModificationTicks.insert(m_FoundContent.ModificationTicks.end(), ModificatonTicks.begin(), ModificatonTicks.end()); }); } } } GetFolderContentStatistics& m_Stats; std::atomic& m_AbortFlag; RwLock m_Lock; FolderContent& m_FoundContent; std::function m_AcceptDirectory; std::function m_AcceptFile; } Visitor(Stats, AbortFlag, Content, std::move(AcceptDirectory), std::move(AcceptFile)); Latch PendingWork(1); GetDirectoryContent(RootPath, DirectoryContentFlags::IncludeFiles | DirectoryContentFlags::Recursive | DirectoryContentFlags::IncludeFileSizes | DirectoryContentFlags::IncludeAttributes | DirectoryContentFlags::IncludeModificationTick, Visitor, WorkerPool, PendingWork); PendingWork.CountDown(); while (!PendingWork.Wait(UpdateIntervalMS)) { UpdateCallback(AbortFlag.load(), PendingWork.Remaining()); } std::vector Order; size_t PathCount = Content.Paths.size(); Order.resize(Content.Paths.size()); std::vector Parents; Parents.reserve(PathCount); std::vector Filenames; Filenames.reserve(PathCount); for (size_t OrderIndex = 0; OrderIndex < PathCount; OrderIndex++) { Order[OrderIndex] = OrderIndex; Parents.emplace_back(Content.Paths[OrderIndex].parent_path().generic_string()); Filenames.emplace_back(Content.Paths[OrderIndex].filename().generic_string()); } std::sort(Order.begin(), Order.end(), [&Parents, &Filenames](size_t Lhs, size_t Rhs) { const std::string& LhsParent = Parents[Lhs]; const std::string& RhsParent = Parents[Rhs]; if (LhsParent < RhsParent) { return true; } else if (LhsParent > RhsParent) { return false; } return Filenames[Lhs] < Filenames[Rhs]; }); FolderContent OrderedContent; OrderedContent.Paths.reserve(PathCount); OrderedContent.RawSizes.reserve(PathCount); OrderedContent.Attributes.reserve(PathCount); OrderedContent.ModificationTicks.reserve(PathCount); for (size_t OrderIndex : Order) { OrderedContent.Paths.emplace_back(std::move(Content.Paths[OrderIndex])); OrderedContent.RawSizes.emplace_back(Content.RawSizes[OrderIndex]); OrderedContent.Attributes.emplace_back(Content.Attributes[OrderIndex]); OrderedContent.ModificationTicks.emplace_back(Content.ModificationTicks[OrderIndex]); } return OrderedContent; } void SaveChunkedFolderContentToCompactBinary(const ChunkedFolderContent& Content, CbWriter& Output) { ZEN_TRACE_CPU("SaveChunkedFolderContentToCompactBinary"); Output.AddString("platform"sv, ToString(Content.Platform)); compactbinary_helpers::WriteArray(Content.Paths, "paths"sv, Output); compactbinary_helpers::WriteArray(Content.RawSizes, "rawSizes"sv, Output); compactbinary_helpers::WriteArray(Content.Attributes, "attributes"sv, Output); compactbinary_helpers::WriteArray(Content.RawHashes, "rawHashes"sv, Output); Output.BeginObject("chunkedContent"); compactbinary_helpers::WriteArray(Content.ChunkedContent.SequenceRawHashes, "sequenceRawHashes"sv, Output); compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkCounts, "chunkCounts"sv, Output); compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkOrders, "chunkOrders"sv, Output); compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkHashes, "chunkHashes"sv, Output); compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkRawSizes, "chunkRawSizes"sv, Output); Output.EndObject(); // chunkedContent } ChunkedFolderContent LoadChunkedFolderContentToCompactBinary(CbObjectView Input) { ZEN_TRACE_CPU("LoadChunkedFolderContentToCompactBinary"); ChunkedFolderContent Content; Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); compactbinary_helpers::ReadArray("rawHashes"sv, Input, Content.RawHashes); CbObjectView ChunkedContentView = Input["chunkedContent"sv].AsObjectView(); compactbinary_helpers::ReadArray("sequenceRawHashes"sv, ChunkedContentView, Content.ChunkedContent.SequenceRawHashes); compactbinary_helpers::ReadArray("chunkCounts"sv, ChunkedContentView, Content.ChunkedContent.ChunkCounts); compactbinary_helpers::ReadArray("chunkOrders"sv, ChunkedContentView, Content.ChunkedContent.ChunkOrders); compactbinary_helpers::ReadArray("chunkHashes"sv, ChunkedContentView, Content.ChunkedContent.ChunkHashes); compactbinary_helpers::ReadArray("chunkRawSizes"sv, ChunkedContentView, Content.ChunkedContent.ChunkRawSizes); return Content; } ChunkedFolderContent MergeChunkedFolderContents(const ChunkedFolderContent& Base, std::span Overlays) { ZEN_TRACE_CPU("MergeChunkedFolderContents"); ZEN_ASSERT(!Overlays.empty()); ChunkedFolderContent Result; const size_t BasePathCount = Base.Paths.size(); Result.Paths.reserve(BasePathCount); Result.RawSizes.reserve(BasePathCount); Result.Attributes.reserve(BasePathCount); Result.RawHashes.reserve(BasePathCount); const size_t BaseChunkCount = Base.ChunkedContent.ChunkHashes.size(); Result.ChunkedContent.SequenceRawHashes.reserve(Base.ChunkedContent.SequenceRawHashes.size()); Result.ChunkedContent.ChunkCounts.reserve(BaseChunkCount); Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); Result.ChunkedContent.ChunkOrders.reserve(Base.ChunkedContent.ChunkOrders.size()); tsl::robin_map GenericPathToActualPath; for (const std::filesystem::path& Path : Base.Paths) { GenericPathToActualPath.insert({PathCompareString(Path), Path}); } for (const ChunkedFolderContent& Overlay : Overlays) { for (const std::filesystem::path& Path : Overlay.Paths) { GenericPathToActualPath.insert({PathCompareString(Path), Path}); } } tsl::robin_map RawHashToSequenceRawHashIndex; auto BuildOverlayPaths = [](std::span Overlays) -> tsl::robin_set { tsl::robin_set Result; for (const ChunkedFolderContent& OverlayContent : Overlays) { for (const std::filesystem::path& Path : OverlayContent.Paths) { Result.insert(PathCompareString(Path)); } } return Result; }; auto AddContent = [&BuildOverlayPaths](ChunkedFolderContent& Result, const ChunkedFolderContent& OverlayContent, tsl::robin_map& ChunkHashToChunkIndex, tsl::robin_map& RawHashToSequenceRawHashIndex, const tsl::robin_map& GenericPathToActualPath, std::span Overlays) { const ChunkedContentLookup OverlayLookup = BuildChunkedContentLookup(OverlayContent); tsl::robin_set BaseOverlayPaths = BuildOverlayPaths(Overlays); for (uint32_t PathIndex = 0; PathIndex < OverlayContent.Paths.size(); PathIndex++) { std::string GenericPath = PathCompareString(OverlayContent.Paths[PathIndex]); if (!BaseOverlayPaths.contains(GenericPath)) { // This asset will not be overridden by a later layer - add it const std::filesystem::path OriginalPath = GenericPathToActualPath.at(GenericPath); Result.Paths.push_back(OriginalPath); const IoHash& RawHash = OverlayContent.RawHashes[PathIndex]; Result.RawSizes.push_back(OverlayContent.RawSizes[PathIndex]); Result.Attributes.push_back(OverlayContent.Attributes[PathIndex]); Result.RawHashes.push_back(RawHash); if (OverlayContent.RawSizes[PathIndex] > 0) { if (!RawHashToSequenceRawHashIndex.contains(RawHash)) { RawHashToSequenceRawHashIndex.insert( {RawHash, gsl::narrow(Result.ChunkedContent.SequenceRawHashes.size())}); const uint32_t SequenceRawHashIndex = OverlayLookup.RawHashToSequenceIndex.at(RawHash); const uint32_t OrderIndexOffset = OverlayLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; const uint32_t ChunkCount = OverlayContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; ChunkingStatistics Stats; std::span OriginalChunkOrder = std::span(OverlayContent.ChunkedContent.ChunkOrders).subspan(OrderIndexOffset, ChunkCount); AddChunkSequence(Stats, Result.ChunkedContent, ChunkHashToChunkIndex, RawHash, OriginalChunkOrder, OverlayContent.ChunkedContent.ChunkHashes, OverlayContent.ChunkedContent.ChunkRawSizes); Stats.UniqueSequencesFound++; } } } } }; tsl::robin_map MergedChunkHashToChunkIndex; AddContent(Result, Base, MergedChunkHashToChunkIndex, RawHashToSequenceRawHashIndex, GenericPathToActualPath, Overlays); for (uint32_t OverlayIndex = 0; OverlayIndex < Overlays.size(); OverlayIndex++) { AddContent(Result, Overlays[OverlayIndex], MergedChunkHashToChunkIndex, RawHashToSequenceRawHashIndex, GenericPathToActualPath, Overlays.subspan(OverlayIndex + 1)); } return Result; } ChunkedFolderContent DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, const ChunkedContentLookup& BaseContentLookup, std::span DeletedPaths) { ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); ZEN_ASSERT(DeletedPaths.size() <= BaseContent.Paths.size()); ChunkedFolderContent Result = {.Platform = BaseContent.Platform}; if (DeletedPaths.size() < BaseContent.Paths.size()) { tsl::robin_set DeletedPathSet; DeletedPathSet.reserve(DeletedPaths.size()); for (const std::filesystem::path& DeletedPath : DeletedPaths) { DeletedPathSet.insert(PathCompareString(DeletedPath)); } const size_t BaseChunkCount = BaseContent.ChunkedContent.ChunkHashes.size(); std::vector NewChunkIndexes(BaseChunkCount, (uint32_t)-1); const size_t ExpectedPathCount = BaseContent.Paths.size() - DeletedPaths.size(); Result.Paths.reserve(ExpectedPathCount); Result.RawSizes.reserve(ExpectedPathCount); Result.Attributes.reserve(ExpectedPathCount); Result.RawHashes.reserve(ExpectedPathCount); Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); tsl::robin_map RawHashToSequenceRawHashIndex; for (uint32_t PathIndex = 0; PathIndex < BaseContent.Paths.size(); PathIndex++) { const std::filesystem::path& Path = BaseContent.Paths[PathIndex]; if (!DeletedPathSet.contains(PathCompareString(Path))) { const IoHash& RawHash = BaseContent.RawHashes[PathIndex]; const uint64_t RawSize = BaseContent.RawSizes[PathIndex]; Result.Paths.push_back(Path); Result.RawSizes.push_back(RawSize); Result.Attributes.push_back(BaseContent.Attributes[PathIndex]); Result.RawHashes.push_back(RawHash); if (RawSize > 0) { if (!RawHashToSequenceRawHashIndex.contains(RawHash)) { RawHashToSequenceRawHashIndex.insert( {RawHash, gsl::narrow(Result.ChunkedContent.SequenceRawHashes.size())}); const uint32_t SequenceRawHashIndex = BaseContentLookup.RawHashToSequenceIndex.at(RawHash); const uint32_t OrderIndexOffset = BaseContentLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; const uint32_t ChunkCount = BaseContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; std::span OriginalChunkOrder = std::span(BaseContent.ChunkedContent.ChunkOrders).subspan(OrderIndexOffset, ChunkCount); Result.ChunkedContent.ChunkCounts.push_back(gsl::narrow(OriginalChunkOrder.size())); for (uint32_t OldChunkIndex : OriginalChunkOrder) { if (uint32_t FoundChunkIndex = NewChunkIndexes[OldChunkIndex]; FoundChunkIndex != (uint32_t)-1) { Result.ChunkedContent.ChunkOrders.push_back(FoundChunkIndex); } else { const uint32_t NewChunkIndex = gsl::narrow(Result.ChunkedContent.ChunkHashes.size()); NewChunkIndexes[OldChunkIndex] = NewChunkIndex; const IoHash& ChunkHash = BaseContent.ChunkedContent.ChunkHashes[OldChunkIndex]; const uint64_t OldChunkSize = BaseContent.ChunkedContent.ChunkRawSizes[OldChunkIndex]; Result.ChunkedContent.ChunkHashes.push_back(ChunkHash); Result.ChunkedContent.ChunkRawSizes.push_back(OldChunkSize); Result.ChunkedContent.ChunkOrders.push_back(NewChunkIndex); } } Result.ChunkedContent.SequenceRawHashes.push_back(RawHash); } } } } } return Result; } ChunkedFolderContent DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span DeletedPaths) { ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); ZEN_ASSERT(DeletedPaths.size() <= BaseContent.Paths.size()); if (DeletedPaths.size() == BaseContent.Paths.size()) { return {}; } const ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent); return DeletePathsFromChunkedContent(BaseContent, BaseLookup, DeletedPaths); } ChunkedFolderContent ChunkFolderContent(ChunkingStatistics& Stats, WorkerThreadPool& WorkerPool, const std::filesystem::path& RootPath, const FolderContent& Content, const ChunkingController& InChunkingController, int32_t UpdateIntervalMS, std::function&& UpdateCallback, std::atomic& AbortFlag, std::atomic& PauseFlag) { ZEN_TRACE_CPU("ChunkFolderContent"); Stopwatch Timer; auto _ = MakeGuard([&Stats, &Timer]() { Stats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); }); ChunkedFolderContent Result = {.Platform = Content.Platform, .Paths = Content.Paths, .RawSizes = Content.RawSizes, .Attributes = Content.Attributes}; const size_t ItemCount = Result.Paths.size(); Result.RawHashes.resize(ItemCount, IoHash::Zero); Result.ChunkedContent.SequenceRawHashes.reserve(ItemCount); // Up to 1 per file, maybe less Result.ChunkedContent.ChunkCounts.reserve(ItemCount); // Up to one per file Result.ChunkedContent.ChunkOrders.reserve(ItemCount); // At least 1 per file, maybe more Result.ChunkedContent.ChunkHashes.reserve(ItemCount); // At least 1 per file, maybe more Result.ChunkedContent.ChunkRawSizes.reserve(ItemCount); // At least 1 per file, maybe more tsl::robin_map ChunkHashToChunkIndex; tsl::robin_map RawHashToChunkSequenceIndex; RawHashToChunkSequenceIndex.reserve(ItemCount); ChunkHashToChunkIndex.reserve(ItemCount); { std::vector Order; Order.resize(ItemCount); for (uint32_t I = 0; I < ItemCount; I++) { Order[I] = I; } // Handle the biggest files first so we don't end up with one straggling large file at the end // std::sort(Order.begin(), Order.end(), [&](uint32_t Lhs, uint32_t Rhs) { return Result.RawSizes[Lhs] > Result.RawSizes[Rhs]; //}); tsl::robin_map RawHashToSequenceRawHashIndex; RawHashToSequenceRawHashIndex.reserve(ItemCount); RwLock Lock; ParallelWork Work(AbortFlag, PauseFlag); for (uint32_t PathIndex : Order) { if (Work.IsAborted()) { break; } Work.ScheduleWork(WorkerPool, // GetSyncWorkerPool() [&, PathIndex](std::atomic& AbortFlag) { if (!AbortFlag) { IoHash RawHash = HashOneFile(Stats, InChunkingController, Result, ChunkHashToChunkIndex, RawHashToSequenceRawHashIndex, Lock, RootPath, PathIndex, AbortFlag); Lock.WithExclusiveLock([&]() { Result.RawHashes[PathIndex] = RawHash; }); Stats.FilesProcessed++; } }); } Work.Wait(UpdateIntervalMS, [&](bool IsAborted, bool IsPaused, std::ptrdiff_t PendingWork) { ZEN_UNUSED(PendingWork); UpdateCallback(IsAborted, IsPaused, Work.PendingWork().Remaining()); }); } return Result; } ChunkedContentLookup BuildChunkedContentLookup(const ChunkedFolderContent& Content) { ZEN_TRACE_CPU("BuildChunkedContentLookup"); struct ChunkLocationReference { uint32_t ChunkIndex = (uint32_t)-1; uint32_t SequenceIndex = (uint32_t)-1; uint64_t Offset = (uint64_t)-1; }; ChunkedContentLookup Result; { const uint32_t SequenceRawHashesCount = gsl::narrow(Content.ChunkedContent.SequenceRawHashes.size()); Result.RawHashToSequenceIndex.reserve(SequenceRawHashesCount); Result.SequenceIndexChunkOrderOffset.reserve(SequenceRawHashesCount); uint32_t OrderOffset = 0; for (uint32_t SequenceRawHashIndex = 0; SequenceRawHashIndex < Content.ChunkedContent.SequenceRawHashes.size(); SequenceRawHashIndex++) { Result.RawHashToSequenceIndex.insert({Content.ChunkedContent.SequenceRawHashes[SequenceRawHashIndex], SequenceRawHashIndex}); Result.SequenceIndexChunkOrderOffset.push_back(OrderOffset); OrderOffset += Content.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; } } std::vector Locations; Locations.reserve(Content.ChunkedContent.ChunkOrders.size()); for (uint32_t SequenceIndex = 0; SequenceIndex < Content.ChunkedContent.SequenceRawHashes.size(); SequenceIndex++) { const uint32_t OrderOffset = Result.SequenceIndexChunkOrderOffset[SequenceIndex]; const uint32_t ChunkCount = Content.ChunkedContent.ChunkCounts[SequenceIndex]; uint64_t LocationOffset = 0; for (size_t OrderIndex = OrderOffset; OrderIndex < OrderOffset + ChunkCount; OrderIndex++) { uint32_t ChunkIndex = Content.ChunkedContent.ChunkOrders[OrderIndex]; Locations.push_back(ChunkLocationReference{.ChunkIndex = ChunkIndex, .SequenceIndex = SequenceIndex, .Offset = LocationOffset}); LocationOffset += Content.ChunkedContent.ChunkRawSizes[ChunkIndex]; } } std::sort(Locations.begin(), Locations.end(), [](const ChunkLocationReference& Lhs, const ChunkLocationReference& Rhs) { if (Lhs.ChunkIndex < Rhs.ChunkIndex) { return true; } if (Lhs.ChunkIndex > Rhs.ChunkIndex) { return false; } if (Lhs.SequenceIndex < Rhs.SequenceIndex) { return true; } if (Lhs.SequenceIndex > Rhs.SequenceIndex) { return false; } return Lhs.Offset < Rhs.Offset; }); Result.ChunkSequenceLocations.reserve(Locations.size()); const uint32_t ChunkCount = gsl::narrow(Content.ChunkedContent.ChunkHashes.size()); Result.ChunkHashToChunkIndex.reserve(ChunkCount); size_t RangeOffset = 0; for (uint32_t ChunkIndex = 0; ChunkIndex < ChunkCount; ChunkIndex++) { Result.ChunkHashToChunkIndex.insert({Content.ChunkedContent.ChunkHashes[ChunkIndex], ChunkIndex}); uint32_t Count = 0; while ((RangeOffset + Count < Locations.size()) && (Locations[RangeOffset + Count].ChunkIndex == ChunkIndex)) { const ChunkLocationReference& LocationReference = Locations[RangeOffset + Count]; Result.ChunkSequenceLocations.push_back( ChunkedContentLookup::ChunkSequenceLocation{.SequenceIndex = LocationReference.SequenceIndex, .Offset = LocationReference.Offset}); Count++; } Result.ChunkSequenceLocationOffset.push_back(RangeOffset); Result.ChunkSequenceLocationCounts.push_back(Count); RangeOffset += Count; } Result.SequenceIndexFirstPathIndex.resize(Content.ChunkedContent.SequenceRawHashes.size(), (uint32_t)-1); Result.PathExtensionHash.resize(Content.Paths.size()); for (uint32_t PathIndex = 0; PathIndex < Content.Paths.size(); PathIndex++) { std::string LowercaseExtension = Content.Paths[PathIndex].extension().string(); std::transform(LowercaseExtension.begin(), LowercaseExtension.end(), LowercaseExtension.begin(), ::tolower); Result.PathExtensionHash[PathIndex] = HashStringDjb2(LowercaseExtension); if (Content.RawSizes[PathIndex] > 0) { const IoHash& RawHash = Content.RawHashes[PathIndex]; auto SequenceIndexIt = Result.RawHashToSequenceIndex.find(RawHash); ZEN_ASSERT(SequenceIndexIt != Result.RawHashToSequenceIndex.end()); const uint32_t SequenceIndex = SequenceIndexIt->second; if (Result.SequenceIndexFirstPathIndex[SequenceIndex] == (uint32_t)-1) { Result.SequenceIndexFirstPathIndex[SequenceIndex] = PathIndex; } } } return Result; } } // namespace zen