diff options
| author | Dan Engelbrecht <[email protected]> | 2025-10-03 11:49:14 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-10-03 11:49:14 +0200 |
| commit | faf0b7c9b6a08b095f8dc895904f4f7d3f30dcde (patch) | |
| tree | 2bcd09fe17af6f25108fd05578e7eda6a827d8ec /src/zenremotestore | |
| parent | cache RPC replay fixes (minor) (#544) (diff) | |
| download | zen-faf0b7c9b6a08b095f8dc895904f4f7d3f30dcde.tar.xz zen-faf0b7c9b6a08b095f8dc895904f4f7d3f30dcde.zip | |
move chunking code to zenremotestore lib (#545)
Diffstat (limited to 'src/zenremotestore')
15 files changed, 3070 insertions, 4 deletions
diff --git a/src/zenremotestore/chunking/chunkblock.cpp b/src/zenremotestore/chunking/chunkblock.cpp new file mode 100644 index 000000000..05ae13de1 --- /dev/null +++ b/src/zenremotestore/chunking/chunkblock.cpp @@ -0,0 +1,320 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zenremotestore/chunking/chunkblock.h> + +#include <zencore/compactbinarybuilder.h> +#include <zencore/fmtutils.h> +#include <zencore/logging.h> + +#include <vector> + +#if ZEN_WITH_TESTS +# include <zencore/testing.h> +# include <zencore/testutils.h> + +# include <unordered_map> +#endif // ZEN_WITH_TESTS + +namespace zen { + +using namespace std::literals; + +ChunkBlockDescription +ParseChunkBlockDescription(const CbObjectView& BlockObject) +{ + ChunkBlockDescription Result; + Result.BlockHash = BlockObject["rawHash"sv].AsHash(); + if (Result.BlockHash != IoHash::Zero) + { + Result.HeaderSize = BlockObject["headerSize"sv].AsUInt64(); + CbArrayView ChunksArray = BlockObject["rawHashes"sv].AsArrayView(); + Result.ChunkRawHashes.reserve(ChunksArray.Num()); + for (CbFieldView ChunkView : ChunksArray) + { + Result.ChunkRawHashes.push_back(ChunkView.AsHash()); + } + + CbArrayView ChunkRawLengthsArray = BlockObject["chunkRawLengths"sv].AsArrayView(); + Result.ChunkRawLengths.reserve(ChunkRawLengthsArray.Num()); + for (CbFieldView ChunkView : ChunkRawLengthsArray) + { + Result.ChunkRawLengths.push_back(ChunkView.AsUInt32()); + } + + CbArrayView ChunkCompressedLengthsArray = BlockObject["chunkCompressedLengths"sv].AsArrayView(); + Result.ChunkCompressedLengths.reserve(ChunkCompressedLengthsArray.Num()); + for (CbFieldView ChunkView : ChunkCompressedLengthsArray) + { + Result.ChunkCompressedLengths.push_back(ChunkView.AsUInt32()); + } + } + return Result; +} + +std::vector<ChunkBlockDescription> +ParseChunkBlockDescriptionList(const CbObjectView& BlocksObject) +{ + if (!BlocksObject) + { + return {}; + } + std::vector<ChunkBlockDescription> Result; + CbArrayView Blocks = BlocksObject["blocks"sv].AsArrayView(); + Result.reserve(Blocks.Num()); + for (CbFieldView BlockView : Blocks) + { + CbObjectView BlockObject = BlockView.AsObjectView(); + Result.emplace_back(ParseChunkBlockDescription(BlockObject)); + } + return Result; +} + +CbObject +BuildChunkBlockDescription(const ChunkBlockDescription& Block, CbObjectView MetaData) +{ + ZEN_ASSERT(Block.BlockHash != IoHash::Zero); + ZEN_ASSERT(Block.HeaderSize > 0); + ZEN_ASSERT(Block.ChunkRawLengths.size() == Block.ChunkRawHashes.size()); + ZEN_ASSERT(Block.ChunkCompressedLengths.size() == Block.ChunkRawHashes.size()); + + CbObjectWriter Writer; + Writer.AddHash("rawHash"sv, Block.BlockHash); + Writer.AddInteger("headerSize"sv, Block.HeaderSize); + Writer.BeginArray("rawHashes"sv); + { + for (const IoHash& ChunkHash : Block.ChunkRawHashes) + { + Writer.AddHash(ChunkHash); + } + } + Writer.EndArray(); + + Writer.BeginArray("chunkRawLengths"); + { + for (uint32_t ChunkSize : Block.ChunkRawLengths) + { + Writer.AddInteger(ChunkSize); + } + } + Writer.EndArray(); + + Writer.BeginArray("chunkCompressedLengths"); + { + for (uint32_t ChunkSize : Block.ChunkCompressedLengths) + { + Writer.AddInteger(ChunkSize); + } + } + Writer.EndArray(); + + Writer.AddObject("metadata", MetaData); + + return Writer.Save(); +} + +ChunkBlockDescription +GetChunkBlockDescription(const SharedBuffer& BlockPayload, const IoHash& RawHash) +{ + ChunkBlockDescription BlockDescription = {{.BlockHash = IoHash::HashBuffer(BlockPayload)}}; + if (BlockDescription.BlockHash != RawHash) + { + throw std::runtime_error(fmt::format("Block {} content hash {} does not match block hash", RawHash, BlockDescription.BlockHash)); + } + if (IterateChunkBlock( + BlockPayload, + [&BlockDescription, RawHash](CompressedBuffer&& Chunk, const IoHash& AttachmentHash) { + if (CompositeBuffer Decompressed = Chunk.DecompressToComposite(); Decompressed) + { + IoHash ChunkHash = IoHash::HashBuffer(Decompressed.Flatten()); + if (ChunkHash != AttachmentHash) + { + throw std::runtime_error( + fmt::format("Chunk {} in block {} content hash {} does not match chunk", AttachmentHash, RawHash, ChunkHash)); + } + BlockDescription.ChunkRawHashes.push_back(AttachmentHash); + BlockDescription.ChunkRawLengths.push_back(gsl::narrow<uint32_t>(Decompressed.GetSize())); + BlockDescription.ChunkCompressedLengths.push_back(gsl::narrow<uint32_t>(Chunk.GetCompressedSize())); + } + else + { + throw std::runtime_error(fmt::format("Chunk {} in block {} is not a compressed buffer", AttachmentHash, RawHash)); + } + }, + BlockDescription.HeaderSize)) + { + return BlockDescription; + } + else + { + throw std::runtime_error(fmt::format("Block {} is malformed", RawHash)); + } +} + +CompressedBuffer +GenerateChunkBlock(std::vector<std::pair<IoHash, FetchChunkFunc>>&& FetchChunks, ChunkBlockDescription& OutBlock) +{ + const size_t ChunkCount = FetchChunks.size(); + + std::vector<SharedBuffer> ChunkSegments; + ChunkSegments.resize(1); + ChunkSegments.reserve(1 + ChunkCount); + OutBlock.ChunkRawHashes.reserve(ChunkCount); + OutBlock.ChunkRawLengths.reserve(ChunkCount); + OutBlock.ChunkCompressedLengths.reserve(ChunkCount); + { + IoBuffer TempBuffer(ChunkCount * 9); + MutableMemoryView View = TempBuffer.GetMutableView(); + uint8_t* BufferStartPtr = reinterpret_cast<uint8_t*>(View.GetData()); + uint8_t* BufferEndPtr = BufferStartPtr; + BufferEndPtr += WriteVarUInt(gsl::narrow<uint64_t>(ChunkCount), BufferEndPtr); + for (const auto& It : FetchChunks) + { + std::pair<uint64_t, CompressedBuffer> Chunk = It.second(It.first); + uint64_t ChunkSize = 0; + std::span<const SharedBuffer> Segments = Chunk.second.GetCompressed().GetSegments(); + for (const SharedBuffer& Segment : Segments) + { + ZEN_ASSERT(Segment.IsOwned()); + ChunkSize += Segment.GetSize(); + ChunkSegments.push_back(Segment); + } + BufferEndPtr += WriteVarUInt(ChunkSize, BufferEndPtr); + OutBlock.ChunkRawHashes.push_back(It.first); + OutBlock.ChunkRawLengths.push_back(gsl::narrow<uint32_t>(Chunk.first)); + OutBlock.ChunkCompressedLengths.push_back(gsl::narrow<uint32_t>(ChunkSize)); + } + ZEN_ASSERT(BufferEndPtr <= View.GetDataEnd()); + ptrdiff_t TempBufferLength = std::distance(BufferStartPtr, BufferEndPtr); + ChunkSegments[0] = SharedBuffer(IoBuffer(TempBuffer, 0, gsl::narrow<size_t>(TempBufferLength))); + OutBlock.HeaderSize = TempBufferLength; + } + CompressedBuffer CompressedBlock = + CompressedBuffer::Compress(CompositeBuffer(std::move(ChunkSegments)), OodleCompressor::Mermaid, OodleCompressionLevel::None); + OutBlock.BlockHash = CompressedBlock.DecodeRawHash(); + return CompressedBlock; +} + +std::vector<uint32_t> +ReadChunkBlockHeader(const MemoryView BlockView, uint64_t& OutHeaderSize) +{ + const uint8_t* ReadPtr = reinterpret_cast<const uint8_t*>(BlockView.GetData()); + uint32_t NumberSize; + uint64_t ChunkCount = ReadVarUInt(ReadPtr, NumberSize); + ReadPtr += NumberSize; + std::vector<uint32_t> ChunkSizes; + ChunkSizes.reserve(ChunkCount); + while (ChunkCount--) + { + if (ReadPtr >= BlockView.GetDataEnd()) + { + throw std::runtime_error("Invalid block header, block data ended unexpectedly"); + } + uint64_t ChunkSize = ReadVarUInt(ReadPtr, NumberSize); + if (ChunkSize > std::numeric_limits<uint32_t>::max()) + { + throw std::runtime_error("Invalid block header, header data is corrupt"); + } + if (ChunkSize < 1) + { + throw std::runtime_error("Invalid block header, header data is corrupt"); + } + ChunkSizes.push_back(gsl::narrow<uint32_t>(ChunkSize)); + ReadPtr += NumberSize; + } + uint64_t Offset = std::distance((const uint8_t*)BlockView.GetData(), ReadPtr); + OutHeaderSize = Offset; + return ChunkSizes; +} + +bool +IterateChunkBlock(const SharedBuffer& BlockPayload, + std::function<void(CompressedBuffer&& Chunk, const IoHash& AttachmentHash)> Visitor, + uint64_t& OutHeaderSize) +{ + ZEN_ASSERT(BlockPayload); + if (BlockPayload.GetSize() < 1) + { + return false; + } + + MemoryView BlockView = BlockPayload.GetView(); + + std::vector<uint32_t> ChunkSizes = ReadChunkBlockHeader(BlockView, OutHeaderSize); + uint64_t Offset = OutHeaderSize; + OutHeaderSize = Offset; + for (uint64_t ChunkSize : ChunkSizes) + { + IoBuffer Chunk(BlockPayload.AsIoBuffer(), Offset, ChunkSize); + IoHash AttachmentRawHash; + uint64_t AttachmentRawSize; + CompressedBuffer CompressedChunk = CompressedBuffer::FromCompressed(SharedBuffer(Chunk), AttachmentRawHash, AttachmentRawSize); + ZEN_ASSERT_SLOW(IoHash::HashBuffer(CompressedChunk.DecompressToComposite()) == AttachmentRawHash); + if (!CompressedChunk) + { + ZEN_ERROR("Invalid chunk in block"); + return false; + } + Visitor(std::move(CompressedChunk), AttachmentRawHash); + Offset += ChunkSize; + ZEN_ASSERT(Offset <= BlockView.GetSize()); + } + return true; +}; + +#if ZEN_WITH_TESTS + +namespace testutils { + static std::vector<std::pair<Oid, CompressedBuffer>> CreateAttachments( + const std::span<const size_t>& Sizes, + OodleCompressionLevel CompressionLevel = OodleCompressionLevel::VeryFast, + uint64_t BlockSize = 0) + { + std::vector<std::pair<Oid, CompressedBuffer>> Result; + Result.reserve(Sizes.size()); + for (size_t Size : Sizes) + { + CompressedBuffer Compressed = + CompressedBuffer::Compress(SharedBuffer(CreateSemiRandomBlob(Size)), OodleCompressor::Mermaid, CompressionLevel, BlockSize); + Result.emplace_back(std::pair<Oid, CompressedBuffer>(Oid::NewOid(), Compressed)); + } + return Result; + } + +} // namespace testutils + +TEST_CASE("project.store.block") +{ + using namespace std::literals; + using namespace testutils; + + std::vector<std::size_t> AttachmentSizes({7633, 6825, 5738, 8031, 7225, 566, 3656, 6006, 24, 3466, 1093, 4269, 2257, 3685, 3489, + 7194, 6151, 5482, 6217, 3511, 6738, 5061, 7537, 2759, 1916, 8210, 2235, 4024, 1582, 5251, + 491, 5464, 4607, 8135, 3767, 4045, 4415, 5007, 8876, 6761, 3359, 8526, 4097, 4855, 8225}); + + std::vector<std::pair<Oid, CompressedBuffer>> AttachmentsWithId = CreateAttachments(AttachmentSizes); + std::vector<std::pair<IoHash, FetchChunkFunc>> Chunks; + Chunks.reserve(AttachmentSizes.size()); + for (const auto& It : AttachmentsWithId) + { + Chunks.push_back( + std::make_pair(It.second.DecodeRawHash(), [Buffer = It.second](const IoHash&) -> std::pair<uint64_t, CompressedBuffer> { + return {Buffer.DecodeRawSize(), Buffer}; + })); + } + ChunkBlockDescription Block; + CompressedBuffer BlockBuffer = GenerateChunkBlock(std::move(Chunks), Block); + uint64_t HeaderSize; + CHECK(IterateChunkBlock( + BlockBuffer.Decompress(), + [](CompressedBuffer&&, const IoHash&) {}, + HeaderSize)); +} + +void +chunkblock_forcelink() +{ +} + +#endif // ZEN_WITH_TESTS + +} // namespace zen diff --git a/src/zenremotestore/chunking/chunkedcontent.cpp b/src/zenremotestore/chunking/chunkedcontent.cpp new file mode 100644 index 000000000..e97dcff15 --- /dev/null +++ b/src/zenremotestore/chunking/chunkedcontent.cpp @@ -0,0 +1,953 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zenremotestore/chunking/chunkedcontent.h> + +#include <zencore/filesystem.h> +#include <zencore/fmtutils.h> +#include <zencore/logging.h> +#include <zencore/scopeguard.h> +#include <zencore/timer.h> +#include <zencore/trace.h> + +#include <zenremotestore/chunking/chunkedfile.h> +#include <zenremotestore/chunking/chunkingcontroller.h> +#include <zenutil/parallelwork.h> +#include <zenutil/workerpools.h> + +ZEN_THIRD_PARTY_INCLUDES_START +#include <tsl/robin_set.h> +#include <gsl/gsl-lite.hpp> +ZEN_THIRD_PARTY_INCLUDES_END + +namespace zen { + +using namespace std::literals; + +namespace { + void AddChunkSequence(ChunkingStatistics& Stats, + ChunkedContentData& InOutChunkedContent, + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, + const IoHash& RawHash, + std::span<const uint32_t> ChunkSequence, + std::span<const IoHash> ChunkHashes, + std::span<const uint64_t> ChunkRawSizes) + { + ZEN_ASSERT(ChunkHashes.size() == ChunkRawSizes.size()); + InOutChunkedContent.ChunkCounts.push_back(gsl::narrow<uint32_t>(ChunkSequence.size())); + InOutChunkedContent.ChunkOrders.reserve(InOutChunkedContent.ChunkOrders.size() + ChunkSequence.size()); + + for (uint32_t ChunkedSequenceIndex : ChunkSequence) + { + const IoHash& ChunkHash = ChunkHashes[ChunkedSequenceIndex]; + if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(It->second); + InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(InOutChunkedContent.ChunkHashes.size()); + ChunkHashToChunkIndex.insert_or_assign(ChunkHash, ChunkIndex); + InOutChunkedContent.ChunkHashes.push_back(ChunkHash); + InOutChunkedContent.ChunkRawSizes.push_back(ChunkRawSizes[ChunkedSequenceIndex]); + InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); + Stats.UniqueChunksFound++; + Stats.UniqueBytesFound += ChunkRawSizes[ChunkedSequenceIndex]; + } + } + InOutChunkedContent.SequenceRawHashes.push_back(RawHash); + Stats.UniqueSequencesFound++; + } + + void AddChunkSequence(ChunkingStatistics& Stats, + ChunkedContentData& InOutChunkedContent, + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, + const IoHash& RawHash, + const uint64_t RawSize) + { + InOutChunkedContent.ChunkCounts.push_back(1); + + if (auto It = ChunkHashToChunkIndex.find(RawHash); It != ChunkHashToChunkIndex.end()) + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(It->second); + InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(InOutChunkedContent.ChunkHashes.size()); + ChunkHashToChunkIndex.insert_or_assign(RawHash, ChunkIndex); + InOutChunkedContent.ChunkHashes.push_back(RawHash); + InOutChunkedContent.ChunkRawSizes.push_back(RawSize); + InOutChunkedContent.ChunkOrders.push_back(ChunkIndex); + Stats.UniqueChunksFound++; + Stats.UniqueBytesFound += RawSize; + } + InOutChunkedContent.SequenceRawHashes.push_back(RawHash); + Stats.UniqueSequencesFound++; + } + + IoHash HashOneFile(ChunkingStatistics& Stats, + const ChunkingController& InChunkingController, + ChunkedFolderContent& OutChunkedContent, + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& RawHashToSequenceRawHashIndex, + RwLock& Lock, + const std::filesystem::path& FolderPath, + uint32_t PathIndex, + std::atomic<bool>& AbortFlag) + { + ZEN_TRACE_CPU("ChunkFolderContent"); + + const uint64_t RawSize = OutChunkedContent.RawSizes[PathIndex]; + const std::filesystem::path& Path = OutChunkedContent.Paths[PathIndex]; + + if (RawSize == 0) + { + return IoHash::Zero; + } + else + { + ChunkedInfoWithSource Chunked; + const bool DidChunking = + InChunkingController.ProcessFile((FolderPath / Path).make_preferred(), RawSize, Chunked, Stats.BytesHashed, AbortFlag); + if (DidChunking) + { + Lock.WithExclusiveLock([&]() { + if (!RawHashToSequenceRawHashIndex.contains(Chunked.Info.RawHash)) + { + RawHashToSequenceRawHashIndex.insert( + {Chunked.Info.RawHash, gsl::narrow<uint32_t>(OutChunkedContent.ChunkedContent.SequenceRawHashes.size())}); + std::vector<uint64_t> ChunkSizes; + ChunkSizes.reserve(Chunked.ChunkSources.size()); + for (const ChunkSource& Source : Chunked.ChunkSources) + { + ChunkSizes.push_back(Source.Size); + } + AddChunkSequence(Stats, + OutChunkedContent.ChunkedContent, + ChunkHashToChunkIndex, + Chunked.Info.RawHash, + Chunked.Info.ChunkSequence, + Chunked.Info.ChunkHashes, + ChunkSizes); + Stats.UniqueSequencesFound++; + } + }); + Stats.FilesChunked++; + return Chunked.Info.RawHash; + } + else + { + ZEN_TRACE_CPU("HashOnly"); + + IoBuffer Buffer = IoBufferBuilder::MakeFromFile((FolderPath / Path).make_preferred()); + if (Buffer.GetSize() != RawSize) + { + throw std::runtime_error(fmt::format("Failed opening file '{}' for hashing", FolderPath / Path)); + } + const IoHash Hash = IoHash::HashBuffer(Buffer, &Stats.BytesHashed); + + Lock.WithExclusiveLock([&]() { + if (!RawHashToSequenceRawHashIndex.contains(Hash)) + { + RawHashToSequenceRawHashIndex.insert( + {Hash, gsl::narrow<uint32_t>(OutChunkedContent.ChunkedContent.SequenceRawHashes.size())}); + AddChunkSequence(Stats, OutChunkedContent.ChunkedContent, ChunkHashToChunkIndex, Hash, RawSize); + Stats.UniqueSequencesFound++; + } + }); + return Hash; + } + } + } + + std::string PathCompareString(const std::filesystem::path& Path) { return ToLower(Path.generic_string()); } + +} // namespace + +std::string_view FolderContentSourcePlatformNames[(size_t)SourcePlatform::_Count] = {"Windows"sv, "Linux"sv, "MacOS"sv}; + +std::string_view +ToString(SourcePlatform Platform) +{ + return FolderContentSourcePlatformNames[(size_t)Platform]; +} + +SourcePlatform +FromString(std::string_view Platform, SourcePlatform Default) +{ + for (size_t Index = 0; Index < (size_t)SourcePlatform::_Count; Index++) + { + if (Platform == FolderContentSourcePlatformNames[Index]) + { + return (SourcePlatform)Index; + } + } + return Default; +} + +SourcePlatform +GetSourceCurrentPlatform() +{ +#if ZEN_PLATFORM_WINDOWS + return SourcePlatform::Windows; +#endif +#if ZEN_PLATFORM_MAC + return SourcePlatform::MacOS; +#endif +#if ZEN_PLATFORM_LINUX + return SourcePlatform::Linux; +#endif +} + +bool +FolderContent::AreFileAttributesEqual(const uint32_t Lhs, const uint32_t Rhs) +{ +#if ZEN_PLATFORM_WINDOWS + return (Lhs & 0xff) == (Rhs & 0xff); +#endif +#if ZEN_PLATFORM_MAC + return Lhs == Rhs; +#endif +#if ZEN_PLATFORM_LINUX + return Lhs == Rhs; +#endif +} + +bool +FolderContent::operator==(const FolderContent& Rhs) const +{ + if ((Platform == Rhs.Platform) && (RawSizes == Rhs.RawSizes) && (Attributes == Rhs.Attributes) && + (ModificationTicks == Rhs.ModificationTicks) && (Paths.size() == Rhs.Paths.size())) + { + size_t PathCount = 0; + for (size_t PathIndex = 0; PathIndex < PathCount; PathIndex++) + { + if (Paths[PathIndex].generic_string() != Rhs.Paths[PathIndex].generic_string()) + { + return false; + } + } + return true; + } + return false; +} + +bool +FolderContent::AreKnownFilesEqual(const FolderContent& Rhs) const +{ + ZEN_TRACE_CPU("FolderContent::AreKnownFilesEqual"); + tsl::robin_map<std::string, size_t> RhsPathToIndex; + const size_t RhsPathCount = Rhs.Paths.size(); + RhsPathToIndex.reserve(RhsPathCount); + for (size_t RhsPathIndex = 0; RhsPathIndex < RhsPathCount; RhsPathIndex++) + { + RhsPathToIndex.insert({Rhs.Paths[RhsPathIndex].generic_string(), RhsPathIndex}); + } + const size_t PathCount = Paths.size(); + for (size_t PathIndex = 0; PathIndex < PathCount; PathIndex++) + { + if (auto It = RhsPathToIndex.find(Paths[PathIndex].generic_string()); It != RhsPathToIndex.end()) + { + const size_t RhsPathIndex = It->second; + if ((RawSizes[PathIndex] != Rhs.RawSizes[RhsPathIndex]) || + (!AreFileAttributesEqual(Attributes[PathIndex], Rhs.Attributes[RhsPathIndex])) || + (ModificationTicks[PathIndex] != Rhs.ModificationTicks[RhsPathIndex])) + { + return false; + } + } + else + { + return false; + } + } + return true; +} + +void +FolderContent::UpdateState(const FolderContent& Rhs, std::vector<uint32_t>& OutPathIndexesOufOfDate) +{ + ZEN_TRACE_CPU("FolderContent::UpdateState"); + tsl::robin_map<std::string, uint32_t> RhsPathToIndex; + const uint32_t RhsPathCount = gsl::narrow<uint32_t>(Rhs.Paths.size()); + RhsPathToIndex.reserve(RhsPathCount); + for (uint32_t RhsPathIndex = 0; RhsPathIndex < RhsPathCount; RhsPathIndex++) + { + RhsPathToIndex.insert({Rhs.Paths[RhsPathIndex].generic_string(), RhsPathIndex}); + } + uint32_t PathCount = gsl::narrow<uint32_t>(Paths.size()); + for (uint32_t PathIndex = 0; PathIndex < PathCount;) + { + if (auto It = RhsPathToIndex.find(Paths[PathIndex].generic_string()); It != RhsPathToIndex.end()) + { + const uint32_t RhsPathIndex = It->second; + + if ((RawSizes[PathIndex] != Rhs.RawSizes[RhsPathIndex]) || + (ModificationTicks[PathIndex] != Rhs.ModificationTicks[RhsPathIndex])) + { + RawSizes[PathIndex] = Rhs.RawSizes[RhsPathIndex]; + ModificationTicks[PathIndex] = Rhs.ModificationTicks[RhsPathIndex]; + OutPathIndexesOufOfDate.push_back(PathIndex); + } + Attributes[PathIndex] = Rhs.Attributes[RhsPathIndex]; + PathIndex++; + } + else + { + Paths.erase(Paths.begin() + PathIndex); + RawSizes.erase(RawSizes.begin() + PathIndex); + Attributes.erase(Attributes.begin() + PathIndex); + ModificationTicks.erase(ModificationTicks.begin() + PathIndex); + PathCount--; + } + } +} + +FolderContent +GetUpdatedContent(const FolderContent& Old, const FolderContent& New, std::vector<std::filesystem::path>& OutDeletedPaths) +{ + ZEN_TRACE_CPU("FolderContent::GetUpdatedContent"); + + const uint32_t NewPathCount = gsl::narrow<uint32_t>(New.Paths.size()); + + FolderContent Result = {.Platform = Old.Platform}; + Result.Paths.reserve(NewPathCount); + Result.RawSizes.reserve(NewPathCount); + Result.Attributes.reserve(NewPathCount); + Result.ModificationTicks.reserve(NewPathCount); + + tsl::robin_map<std::string, uint32_t> NewPathToIndex; + NewPathToIndex.reserve(NewPathCount); + for (uint32_t NewPathIndex = 0; NewPathIndex < NewPathCount; NewPathIndex++) + { + NewPathToIndex.insert({New.Paths[NewPathIndex].generic_string(), NewPathIndex}); + } + + uint32_t OldPathCount = gsl::narrow<uint32_t>(Old.Paths.size()); + for (uint32_t OldPathIndex = 0; OldPathIndex < OldPathCount; OldPathIndex++) + { + if (auto It = NewPathToIndex.find(Old.Paths[OldPathIndex].generic_string()); It != NewPathToIndex.end()) + { + const uint32_t NewPathIndex = It->second; + + if ((Old.RawSizes[OldPathIndex] != New.RawSizes[NewPathIndex]) || + (Old.ModificationTicks[OldPathIndex] != New.ModificationTicks[NewPathIndex])) + { + Result.Paths.push_back(New.Paths[NewPathIndex]); + Result.RawSizes.push_back(New.RawSizes[NewPathIndex]); + Result.Attributes.push_back(New.Attributes[NewPathIndex]); + Result.ModificationTicks.push_back(New.ModificationTicks[NewPathIndex]); + } + } + else + { + OutDeletedPaths.push_back(Old.Paths[OldPathIndex]); + } + } + return Result; +} + +void +SaveFolderContentToCompactBinary(const FolderContent& Content, CbWriter& Output) +{ + ZEN_TRACE_CPU("SaveFolderContentToCompactBinary"); + Output.AddString("platform"sv, ToString(Content.Platform)); + compactbinary_helpers::WriteArray(Content.Paths, "paths"sv, Output); + compactbinary_helpers::WriteArray(Content.RawSizes, "rawSizes"sv, Output); + compactbinary_helpers::WriteArray(Content.Attributes, "attributes"sv, Output); + compactbinary_helpers::WriteArray(Content.ModificationTicks, "modificationTimes"sv, Output); +} + +FolderContent +LoadFolderContentToCompactBinary(CbObjectView Input) +{ + ZEN_TRACE_CPU("LoadFolderContentToCompactBinary"); + FolderContent Content; + Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); + compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); + compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); + compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); + compactbinary_helpers::ReadArray("modificationTimes"sv, Input, Content.ModificationTicks); + return Content; +} + +FolderContent +GetFolderContent(GetFolderContentStatistics& Stats, + const std::filesystem::path& RootPath, + std::function<bool(const std::string_view& RelativePath)>&& AcceptDirectory, + std::function<bool(std::string_view RelativePath, uint64_t Size, uint32_t Attributes)>&& AcceptFile, + WorkerThreadPool& WorkerPool, + int32_t UpdateIntervalMS, + std::function<void(bool IsAborted, std::ptrdiff_t PendingWork)>&& UpdateCallback, + std::atomic<bool>& AbortFlag) +{ + ZEN_TRACE_CPU("GetFolderContent"); + + Stopwatch Timer; + auto _ = MakeGuard([&Stats, &Timer]() { Stats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); }); + + FolderContent Content; + struct AsyncVisitor : public GetDirectoryContentVisitor + { + AsyncVisitor(GetFolderContentStatistics& Stats, + std::atomic<bool>& AbortFlag, + FolderContent& Content, + std::function<bool(const std::string_view& RelativePath)>&& AcceptDirectory, + std::function<bool(std::string_view RelativePath, uint64_t Size, uint32_t Attributes)>&& AcceptFile) + : m_Stats(Stats) + , m_AbortFlag(AbortFlag) + , m_FoundContent(Content) + , m_AcceptDirectory(std::move(AcceptDirectory)) + , m_AcceptFile(std::move(AcceptFile)) + { + } + virtual void AsyncVisitDirectory(const std::filesystem::path& RelativeRoot, DirectoryContent&& Content) override + { + if (!m_AbortFlag) + { + m_Stats.FoundFileCount += Content.FileNames.size(); + for (uint64_t FileSize : Content.FileSizes) + { + m_Stats.FoundFileByteCount += FileSize; + } + std::string RelativeDirectoryPath = RelativeRoot.generic_string(); + if (m_AcceptDirectory(RelativeDirectoryPath)) + { + std::vector<std::filesystem::path> Paths; + std::vector<uint64_t> RawSizes; + std::vector<uint32_t> Attributes; + std::vector<uint64_t> ModificatonTicks; + Paths.reserve(Content.FileNames.size()); + RawSizes.reserve(Content.FileNames.size()); + Attributes.reserve(Content.FileNames.size()); + ModificatonTicks.reserve(Content.FileModificationTicks.size()); + + for (size_t FileIndex = 0; FileIndex < Content.FileNames.size(); FileIndex++) + { + const std::filesystem::path& FileName = Content.FileNames[FileIndex]; + std::string RelativePath = (RelativeRoot / FileName).generic_string(); + std::replace(RelativePath.begin(), RelativePath.end(), '\\', '/'); + if (m_AcceptFile(RelativePath, Content.FileSizes[FileIndex], Content.FileAttributes[FileIndex])) + { + Paths.emplace_back(std::move(RelativePath)); + RawSizes.emplace_back(Content.FileSizes[FileIndex]); + Attributes.emplace_back(Content.FileAttributes[FileIndex]); + ModificatonTicks.emplace_back(Content.FileModificationTicks[FileIndex]); + + m_Stats.AcceptedFileCount++; + m_Stats.AcceptedFileByteCount += Content.FileSizes[FileIndex]; + } + } + m_Lock.WithExclusiveLock([&]() { + m_FoundContent.Paths.insert(m_FoundContent.Paths.end(), Paths.begin(), Paths.end()); + m_FoundContent.RawSizes.insert(m_FoundContent.RawSizes.end(), RawSizes.begin(), RawSizes.end()); + m_FoundContent.Attributes.insert(m_FoundContent.Attributes.end(), Attributes.begin(), Attributes.end()); + m_FoundContent.ModificationTicks.insert(m_FoundContent.ModificationTicks.end(), + ModificatonTicks.begin(), + ModificatonTicks.end()); + }); + } + } + } + + GetFolderContentStatistics& m_Stats; + std::atomic<bool>& m_AbortFlag; + RwLock m_Lock; + FolderContent& m_FoundContent; + std::function<bool(const std::string_view& RelativePath)> m_AcceptDirectory; + std::function<bool(std::string_view RelativePath, uint64_t Size, uint32_t Attributes)> m_AcceptFile; + } Visitor(Stats, AbortFlag, Content, std::move(AcceptDirectory), std::move(AcceptFile)); + + Latch PendingWork(1); + GetDirectoryContent(RootPath, + DirectoryContentFlags::IncludeFiles | DirectoryContentFlags::Recursive | DirectoryContentFlags::IncludeFileSizes | + DirectoryContentFlags::IncludeAttributes | DirectoryContentFlags::IncludeModificationTick, + Visitor, + WorkerPool, + PendingWork); + PendingWork.CountDown(); + while (!PendingWork.Wait(UpdateIntervalMS)) + { + UpdateCallback(AbortFlag.load(), PendingWork.Remaining()); + } + std::vector<size_t> Order; + size_t PathCount = Content.Paths.size(); + Order.resize(Content.Paths.size()); + std::vector<std::string> Parents; + Parents.reserve(PathCount); + std::vector<std::string> Filenames; + Filenames.reserve(PathCount); + for (size_t OrderIndex = 0; OrderIndex < PathCount; OrderIndex++) + { + Order[OrderIndex] = OrderIndex; + Parents.emplace_back(Content.Paths[OrderIndex].parent_path().generic_string()); + Filenames.emplace_back(Content.Paths[OrderIndex].filename().generic_string()); + } + std::sort(Order.begin(), Order.end(), [&Parents, &Filenames](size_t Lhs, size_t Rhs) { + const std::string& LhsParent = Parents[Lhs]; + const std::string& RhsParent = Parents[Rhs]; + if (LhsParent < RhsParent) + { + return true; + } + else if (LhsParent > RhsParent) + { + return false; + } + return Filenames[Lhs] < Filenames[Rhs]; + }); + FolderContent OrderedContent; + OrderedContent.Paths.reserve(PathCount); + OrderedContent.RawSizes.reserve(PathCount); + OrderedContent.Attributes.reserve(PathCount); + OrderedContent.ModificationTicks.reserve(PathCount); + for (size_t OrderIndex : Order) + { + OrderedContent.Paths.emplace_back(std::move(Content.Paths[OrderIndex])); + OrderedContent.RawSizes.emplace_back(Content.RawSizes[OrderIndex]); + OrderedContent.Attributes.emplace_back(Content.Attributes[OrderIndex]); + OrderedContent.ModificationTicks.emplace_back(Content.ModificationTicks[OrderIndex]); + } + return OrderedContent; +} + +void +SaveChunkedFolderContentToCompactBinary(const ChunkedFolderContent& Content, CbWriter& Output) +{ + ZEN_TRACE_CPU("SaveChunkedFolderContentToCompactBinary"); + Output.AddString("platform"sv, ToString(Content.Platform)); + compactbinary_helpers::WriteArray(Content.Paths, "paths"sv, Output); + compactbinary_helpers::WriteArray(Content.RawSizes, "rawSizes"sv, Output); + compactbinary_helpers::WriteArray(Content.Attributes, "attributes"sv, Output); + compactbinary_helpers::WriteArray(Content.RawHashes, "rawHashes"sv, Output); + + Output.BeginObject("chunkedContent"); + compactbinary_helpers::WriteArray(Content.ChunkedContent.SequenceRawHashes, "sequenceRawHashes"sv, Output); + compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkCounts, "chunkCounts"sv, Output); + compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkOrders, "chunkOrders"sv, Output); + compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkHashes, "chunkHashes"sv, Output); + compactbinary_helpers::WriteArray(Content.ChunkedContent.ChunkRawSizes, "chunkRawSizes"sv, Output); + Output.EndObject(); // chunkedContent +} + +ChunkedFolderContent +LoadChunkedFolderContentToCompactBinary(CbObjectView Input) +{ + ZEN_TRACE_CPU("LoadChunkedFolderContentToCompactBinary"); + ChunkedFolderContent Content; + Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); + compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); + compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); + compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); + compactbinary_helpers::ReadArray("rawHashes"sv, Input, Content.RawHashes); + + CbObjectView ChunkedContentView = Input["chunkedContent"sv].AsObjectView(); + compactbinary_helpers::ReadArray("sequenceRawHashes"sv, ChunkedContentView, Content.ChunkedContent.SequenceRawHashes); + compactbinary_helpers::ReadArray("chunkCounts"sv, ChunkedContentView, Content.ChunkedContent.ChunkCounts); + compactbinary_helpers::ReadArray("chunkOrders"sv, ChunkedContentView, Content.ChunkedContent.ChunkOrders); + compactbinary_helpers::ReadArray("chunkHashes"sv, ChunkedContentView, Content.ChunkedContent.ChunkHashes); + compactbinary_helpers::ReadArray("chunkRawSizes"sv, ChunkedContentView, Content.ChunkedContent.ChunkRawSizes); + return Content; +} + +ChunkedFolderContent +MergeChunkedFolderContents(const ChunkedFolderContent& Base, std::span<const ChunkedFolderContent> Overlays) +{ + ZEN_TRACE_CPU("MergeChunkedFolderContents"); + + ZEN_ASSERT(!Overlays.empty()); + + ChunkedFolderContent Result; + const size_t BasePathCount = Base.Paths.size(); + Result.Paths.reserve(BasePathCount); + Result.RawSizes.reserve(BasePathCount); + Result.Attributes.reserve(BasePathCount); + Result.RawHashes.reserve(BasePathCount); + + const size_t BaseChunkCount = Base.ChunkedContent.ChunkHashes.size(); + Result.ChunkedContent.SequenceRawHashes.reserve(Base.ChunkedContent.SequenceRawHashes.size()); + Result.ChunkedContent.ChunkCounts.reserve(BaseChunkCount); + Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); + Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); + Result.ChunkedContent.ChunkOrders.reserve(Base.ChunkedContent.ChunkOrders.size()); + + tsl::robin_map<std::string, std::filesystem::path> GenericPathToActualPath; + for (const std::filesystem::path& Path : Base.Paths) + { + GenericPathToActualPath.insert({PathCompareString(Path), Path}); + } + for (const ChunkedFolderContent& Overlay : Overlays) + { + for (const std::filesystem::path& Path : Overlay.Paths) + { + GenericPathToActualPath.insert({PathCompareString(Path), Path}); + } + } + + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceRawHashIndex; + + auto BuildOverlayPaths = [](std::span<const ChunkedFolderContent> Overlays) -> tsl::robin_set<std::string> { + tsl::robin_set<std::string> Result; + for (const ChunkedFolderContent& OverlayContent : Overlays) + { + for (const std::filesystem::path& Path : OverlayContent.Paths) + { + Result.insert(PathCompareString(Path)); + } + } + return Result; + }; + + auto AddContent = [&BuildOverlayPaths](ChunkedFolderContent& Result, + const ChunkedFolderContent& OverlayContent, + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& ChunkHashToChunkIndex, + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher>& RawHashToSequenceRawHashIndex, + const tsl::robin_map<std::string, std::filesystem::path>& GenericPathToActualPath, + std::span<const ChunkedFolderContent> Overlays) { + const ChunkedContentLookup OverlayLookup = BuildChunkedContentLookup(OverlayContent); + tsl::robin_set<std::string> BaseOverlayPaths = BuildOverlayPaths(Overlays); + for (uint32_t PathIndex = 0; PathIndex < OverlayContent.Paths.size(); PathIndex++) + { + std::string GenericPath = PathCompareString(OverlayContent.Paths[PathIndex]); + if (!BaseOverlayPaths.contains(GenericPath)) + { + // This asset will not be overridden by a later layer - add it + + const std::filesystem::path OriginalPath = GenericPathToActualPath.at(GenericPath); + Result.Paths.push_back(OriginalPath); + const IoHash& RawHash = OverlayContent.RawHashes[PathIndex]; + Result.RawSizes.push_back(OverlayContent.RawSizes[PathIndex]); + Result.Attributes.push_back(OverlayContent.Attributes[PathIndex]); + Result.RawHashes.push_back(RawHash); + + if (OverlayContent.RawSizes[PathIndex] > 0) + { + if (!RawHashToSequenceRawHashIndex.contains(RawHash)) + { + RawHashToSequenceRawHashIndex.insert( + {RawHash, gsl::narrow<uint32_t>(Result.ChunkedContent.SequenceRawHashes.size())}); + const uint32_t SequenceRawHashIndex = OverlayLookup.RawHashToSequenceIndex.at(RawHash); + const uint32_t OrderIndexOffset = OverlayLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; + const uint32_t ChunkCount = OverlayContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; + ChunkingStatistics Stats; + std::span<const uint32_t> OriginalChunkOrder = + std::span<const uint32_t>(OverlayContent.ChunkedContent.ChunkOrders).subspan(OrderIndexOffset, ChunkCount); + AddChunkSequence(Stats, + Result.ChunkedContent, + ChunkHashToChunkIndex, + RawHash, + OriginalChunkOrder, + OverlayContent.ChunkedContent.ChunkHashes, + OverlayContent.ChunkedContent.ChunkRawSizes); + Stats.UniqueSequencesFound++; + } + } + } + } + }; + + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> MergedChunkHashToChunkIndex; + AddContent(Result, Base, MergedChunkHashToChunkIndex, RawHashToSequenceRawHashIndex, GenericPathToActualPath, Overlays); + for (uint32_t OverlayIndex = 0; OverlayIndex < Overlays.size(); OverlayIndex++) + { + AddContent(Result, + Overlays[OverlayIndex], + MergedChunkHashToChunkIndex, + RawHashToSequenceRawHashIndex, + GenericPathToActualPath, + Overlays.subspan(OverlayIndex + 1)); + } + return Result; +} + +ChunkedFolderContent +DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, + const ChunkedContentLookup& BaseContentLookup, + std::span<const std::filesystem::path> DeletedPaths) +{ + ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); + + ZEN_ASSERT(DeletedPaths.size() <= BaseContent.Paths.size()); + ChunkedFolderContent Result = {.Platform = BaseContent.Platform}; + if (DeletedPaths.size() < BaseContent.Paths.size()) + { + tsl::robin_set<std::string> DeletedPathSet; + DeletedPathSet.reserve(DeletedPaths.size()); + for (const std::filesystem::path& DeletedPath : DeletedPaths) + { + DeletedPathSet.insert(PathCompareString(DeletedPath)); + } + + const size_t BaseChunkCount = BaseContent.ChunkedContent.ChunkHashes.size(); + std::vector<uint32_t> NewChunkIndexes(BaseChunkCount, (uint32_t)-1); + + const size_t ExpectedPathCount = BaseContent.Paths.size() - DeletedPaths.size(); + Result.Paths.reserve(ExpectedPathCount); + Result.RawSizes.reserve(ExpectedPathCount); + Result.Attributes.reserve(ExpectedPathCount); + Result.RawHashes.reserve(ExpectedPathCount); + + Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); + Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); + + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceRawHashIndex; + for (uint32_t PathIndex = 0; PathIndex < BaseContent.Paths.size(); PathIndex++) + { + const std::filesystem::path& Path = BaseContent.Paths[PathIndex]; + if (!DeletedPathSet.contains(PathCompareString(Path))) + { + const IoHash& RawHash = BaseContent.RawHashes[PathIndex]; + const uint64_t RawSize = BaseContent.RawSizes[PathIndex]; + Result.Paths.push_back(Path); + Result.RawSizes.push_back(RawSize); + Result.Attributes.push_back(BaseContent.Attributes[PathIndex]); + Result.RawHashes.push_back(RawHash); + if (RawSize > 0) + { + if (!RawHashToSequenceRawHashIndex.contains(RawHash)) + { + RawHashToSequenceRawHashIndex.insert( + {RawHash, gsl::narrow<uint32_t>(Result.ChunkedContent.SequenceRawHashes.size())}); + const uint32_t SequenceRawHashIndex = BaseContentLookup.RawHashToSequenceIndex.at(RawHash); + const uint32_t OrderIndexOffset = BaseContentLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; + const uint32_t ChunkCount = BaseContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; + + std::span<const uint32_t> OriginalChunkOrder = + std::span<const uint32_t>(BaseContent.ChunkedContent.ChunkOrders).subspan(OrderIndexOffset, ChunkCount); + + Result.ChunkedContent.ChunkCounts.push_back(gsl::narrow<uint32_t>(OriginalChunkOrder.size())); + + for (uint32_t OldChunkIndex : OriginalChunkOrder) + { + if (uint32_t FoundChunkIndex = NewChunkIndexes[OldChunkIndex]; FoundChunkIndex != (uint32_t)-1) + { + Result.ChunkedContent.ChunkOrders.push_back(FoundChunkIndex); + } + else + { + const uint32_t NewChunkIndex = gsl::narrow<uint32_t>(Result.ChunkedContent.ChunkHashes.size()); + NewChunkIndexes[OldChunkIndex] = NewChunkIndex; + const IoHash& ChunkHash = BaseContent.ChunkedContent.ChunkHashes[OldChunkIndex]; + const uint64_t OldChunkSize = BaseContent.ChunkedContent.ChunkRawSizes[OldChunkIndex]; + Result.ChunkedContent.ChunkHashes.push_back(ChunkHash); + Result.ChunkedContent.ChunkRawSizes.push_back(OldChunkSize); + Result.ChunkedContent.ChunkOrders.push_back(NewChunkIndex); + } + } + Result.ChunkedContent.SequenceRawHashes.push_back(RawHash); + } + } + } + } + } + return Result; +} + +ChunkedFolderContent +DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span<const std::filesystem::path> DeletedPaths) +{ + ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); + ZEN_ASSERT(DeletedPaths.size() <= BaseContent.Paths.size()); + if (DeletedPaths.size() == BaseContent.Paths.size()) + { + return {}; + } + const ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent); + return DeletePathsFromChunkedContent(BaseContent, BaseLookup, DeletedPaths); +} + +ChunkedFolderContent +ChunkFolderContent(ChunkingStatistics& Stats, + WorkerThreadPool& WorkerPool, + const std::filesystem::path& RootPath, + const FolderContent& Content, + const ChunkingController& InChunkingController, + int32_t UpdateIntervalMS, + std::function<void(bool IsAborted, bool IsPaused, std::ptrdiff_t PendingWork)>&& UpdateCallback, + std::atomic<bool>& AbortFlag, + std::atomic<bool>& PauseFlag) +{ + ZEN_TRACE_CPU("ChunkFolderContent"); + + Stopwatch Timer; + auto _ = MakeGuard([&Stats, &Timer]() { Stats.ElapsedWallTimeUS = Timer.GetElapsedTimeUs(); }); + + ChunkedFolderContent Result = {.Platform = Content.Platform, + .Paths = Content.Paths, + .RawSizes = Content.RawSizes, + .Attributes = Content.Attributes}; + const size_t ItemCount = Result.Paths.size(); + Result.RawHashes.resize(ItemCount, IoHash::Zero); + Result.ChunkedContent.SequenceRawHashes.reserve(ItemCount); // Up to 1 per file, maybe less + Result.ChunkedContent.ChunkCounts.reserve(ItemCount); // Up to one per file + Result.ChunkedContent.ChunkOrders.reserve(ItemCount); // At least 1 per file, maybe more + Result.ChunkedContent.ChunkHashes.reserve(ItemCount); // At least 1 per file, maybe more + Result.ChunkedContent.ChunkRawSizes.reserve(ItemCount); // At least 1 per file, maybe more + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToChunkSequenceIndex; + RawHashToChunkSequenceIndex.reserve(ItemCount); + ChunkHashToChunkIndex.reserve(ItemCount); + { + std::vector<uint32_t> Order; + Order.resize(ItemCount); + for (uint32_t I = 0; I < ItemCount; I++) + { + Order[I] = I; + } + + // Handle the biggest files first so we don't end up with one straggling large file at the end + // std::sort(Order.begin(), Order.end(), [&](uint32_t Lhs, uint32_t Rhs) { return Result.RawSizes[Lhs] > Result.RawSizes[Rhs]; + //}); + + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceRawHashIndex; + RawHashToSequenceRawHashIndex.reserve(ItemCount); + + RwLock Lock; + + ParallelWork Work(AbortFlag, PauseFlag, WorkerThreadPool::EMode::EnableBacklog); + + for (uint32_t PathIndex : Order) + { + if (Work.IsAborted()) + { + break; + } + Work.ScheduleWork(WorkerPool, // GetSyncWorkerPool() + [&, PathIndex](std::atomic<bool>& AbortFlag) { + if (!AbortFlag) + { + IoHash RawHash = HashOneFile(Stats, + InChunkingController, + Result, + ChunkHashToChunkIndex, + RawHashToSequenceRawHashIndex, + Lock, + RootPath, + PathIndex, + AbortFlag); + Lock.WithExclusiveLock([&]() { Result.RawHashes[PathIndex] = RawHash; }); + Stats.FilesProcessed++; + } + }); + } + + Work.Wait(UpdateIntervalMS, [&](bool IsAborted, bool IsPaused, std::ptrdiff_t PendingWork) { + ZEN_UNUSED(PendingWork); + UpdateCallback(IsAborted, IsPaused, Work.PendingWork().Remaining()); + }); + } + return Result; +} + +ChunkedContentLookup +BuildChunkedContentLookup(const ChunkedFolderContent& Content) +{ + ZEN_TRACE_CPU("BuildChunkedContentLookup"); + + struct ChunkLocationReference + { + uint32_t ChunkIndex = (uint32_t)-1; + uint32_t SequenceIndex = (uint32_t)-1; + uint64_t Offset = (uint64_t)-1; + }; + + ChunkedContentLookup Result; + { + const uint32_t SequenceRawHashesCount = gsl::narrow<uint32_t>(Content.ChunkedContent.SequenceRawHashes.size()); + Result.RawHashToSequenceIndex.reserve(SequenceRawHashesCount); + Result.SequenceIndexChunkOrderOffset.reserve(SequenceRawHashesCount); + uint32_t OrderOffset = 0; + for (uint32_t SequenceRawHashIndex = 0; SequenceRawHashIndex < Content.ChunkedContent.SequenceRawHashes.size(); + SequenceRawHashIndex++) + { + Result.RawHashToSequenceIndex.insert({Content.ChunkedContent.SequenceRawHashes[SequenceRawHashIndex], SequenceRawHashIndex}); + Result.SequenceIndexChunkOrderOffset.push_back(OrderOffset); + OrderOffset += Content.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; + } + } + + std::vector<ChunkLocationReference> Locations; + Locations.reserve(Content.ChunkedContent.ChunkOrders.size()); + for (uint32_t SequenceIndex = 0; SequenceIndex < Content.ChunkedContent.SequenceRawHashes.size(); SequenceIndex++) + { + const uint32_t OrderOffset = Result.SequenceIndexChunkOrderOffset[SequenceIndex]; + const uint32_t ChunkCount = Content.ChunkedContent.ChunkCounts[SequenceIndex]; + uint64_t LocationOffset = 0; + for (size_t OrderIndex = OrderOffset; OrderIndex < OrderOffset + ChunkCount; OrderIndex++) + { + uint32_t ChunkIndex = Content.ChunkedContent.ChunkOrders[OrderIndex]; + + Locations.push_back(ChunkLocationReference{.ChunkIndex = ChunkIndex, .SequenceIndex = SequenceIndex, .Offset = LocationOffset}); + + LocationOffset += Content.ChunkedContent.ChunkRawSizes[ChunkIndex]; + } + } + + std::sort(Locations.begin(), Locations.end(), [](const ChunkLocationReference& Lhs, const ChunkLocationReference& Rhs) { + if (Lhs.ChunkIndex < Rhs.ChunkIndex) + { + return true; + } + if (Lhs.ChunkIndex > Rhs.ChunkIndex) + { + return false; + } + if (Lhs.SequenceIndex < Rhs.SequenceIndex) + { + return true; + } + if (Lhs.SequenceIndex > Rhs.SequenceIndex) + { + return false; + } + return Lhs.Offset < Rhs.Offset; + }); + + Result.ChunkSequenceLocations.reserve(Locations.size()); + const uint32_t ChunkCount = gsl::narrow<uint32_t>(Content.ChunkedContent.ChunkHashes.size()); + Result.ChunkHashToChunkIndex.reserve(ChunkCount); + size_t RangeOffset = 0; + for (uint32_t ChunkIndex = 0; ChunkIndex < ChunkCount; ChunkIndex++) + { + Result.ChunkHashToChunkIndex.insert({Content.ChunkedContent.ChunkHashes[ChunkIndex], ChunkIndex}); + uint32_t Count = 0; + while ((RangeOffset + Count < Locations.size()) && (Locations[RangeOffset + Count].ChunkIndex == ChunkIndex)) + { + const ChunkLocationReference& LocationReference = Locations[RangeOffset + Count]; + Result.ChunkSequenceLocations.push_back( + ChunkedContentLookup::ChunkSequenceLocation{.SequenceIndex = LocationReference.SequenceIndex, + .Offset = LocationReference.Offset}); + Count++; + } + Result.ChunkSequenceLocationOffset.push_back(RangeOffset); + Result.ChunkSequenceLocationCounts.push_back(Count); + RangeOffset += Count; + } + + Result.SequenceIndexFirstPathIndex.resize(Content.ChunkedContent.SequenceRawHashes.size(), (uint32_t)-1); + Result.PathExtensionHash.resize(Content.Paths.size()); + for (uint32_t PathIndex = 0; PathIndex < Content.Paths.size(); PathIndex++) + { + std::string LowercaseExtension = Content.Paths[PathIndex].extension().string(); + std::transform(LowercaseExtension.begin(), LowercaseExtension.end(), LowercaseExtension.begin(), [](char c) { + return (char)::tolower(c); + }); + Result.PathExtensionHash[PathIndex] = HashStringDjb2(LowercaseExtension); + if (Content.RawSizes[PathIndex] > 0) + { + const IoHash& RawHash = Content.RawHashes[PathIndex]; + auto SequenceIndexIt = Result.RawHashToSequenceIndex.find(RawHash); + ZEN_ASSERT(SequenceIndexIt != Result.RawHashToSequenceIndex.end()); + const uint32_t SequenceIndex = SequenceIndexIt->second; + if (Result.SequenceIndexFirstPathIndex[SequenceIndex] == (uint32_t)-1) + { + Result.SequenceIndexFirstPathIndex[SequenceIndex] = PathIndex; + } + } + } + + return Result; +} + +} // namespace zen diff --git a/src/zenremotestore/chunking/chunkedfile.cpp b/src/zenremotestore/chunking/chunkedfile.cpp new file mode 100644 index 000000000..652110605 --- /dev/null +++ b/src/zenremotestore/chunking/chunkedfile.cpp @@ -0,0 +1,525 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zenremotestore/chunking/chunkedfile.h> + +#include <zencore/basicfile.h> +#include <zencore/trace.h> + +#include "chunking.h" + +ZEN_THIRD_PARTY_INCLUDES_START +#include <tsl/robin_map.h> +#include <gsl/gsl-lite.hpp> +ZEN_THIRD_PARTY_INCLUDES_END + +namespace zen { + +namespace { + struct ChunkedHeader + { + static constexpr uint32_t ExpectedMagic = 0x646b6863; // chkd + static constexpr uint32_t CurrentVersion = 1; + + uint32_t Magic = ExpectedMagic; + uint32_t Version = CurrentVersion; + uint32_t ChunkSequenceLength; + uint32_t ChunkHashCount; + uint64_t ChunkSequenceOffset; + uint64_t ChunkHashesOffset; + uint64_t RawSize = 0; + IoHash RawHash; + }; +} // namespace + +IoBuffer +SerializeChunkedInfo(const ChunkedInfo& Info) +{ + ZEN_TRACE_CPU("SerializeChunkedInfo"); + size_t HeaderSize = RoundUp(sizeof(ChunkedHeader), 16) + RoundUp(sizeof(uint32_t) * Info.ChunkSequence.size(), 16) + + RoundUp(sizeof(IoHash) * Info.ChunkHashes.size(), 16); + IoBuffer HeaderData(HeaderSize); + + ChunkedHeader Header; + Header.ChunkSequenceLength = gsl::narrow<uint32_t>(Info.ChunkSequence.size()); + Header.ChunkHashCount = gsl::narrow<uint32_t>(Info.ChunkHashes.size()); + Header.ChunkSequenceOffset = RoundUp(sizeof(ChunkedHeader), 16); + Header.ChunkHashesOffset = RoundUp(Header.ChunkSequenceOffset + sizeof(uint32_t) * Header.ChunkSequenceLength, 16); + Header.RawSize = Info.RawSize; + Header.RawHash = Info.RawHash; + + MutableMemoryView WriteView = HeaderData.GetMutableView(); + { + MutableMemoryView HeaderWriteView = WriteView.Left(sizeof(Header)); + HeaderWriteView.CopyFrom(MemoryView(&Header, sizeof(Header))); + } + { + MutableMemoryView ChunkSequenceWriteView = WriteView.Mid(Header.ChunkSequenceOffset, sizeof(uint32_t) * Header.ChunkSequenceLength); + ChunkSequenceWriteView.CopyFrom(MemoryView(Info.ChunkSequence.data(), ChunkSequenceWriteView.GetSize())); + } + { + MutableMemoryView ChunksWriteView = WriteView.Mid(Header.ChunkHashesOffset, sizeof(IoHash) * Header.ChunkHashCount); + ChunksWriteView.CopyFrom(MemoryView(Info.ChunkHashes.data(), ChunksWriteView.GetSize())); + } + + return HeaderData; +} + +ChunkedInfo +DeserializeChunkedInfo(IoBuffer& Buffer) +{ + ZEN_TRACE_CPU("DeserializeChunkedInfo"); + MemoryView View = Buffer.GetView(); + ChunkedHeader Header; + { + MutableMemoryView HeaderWriteView(&Header, sizeof(Header)); + HeaderWriteView.CopyFrom(View.Left(sizeof(Header))); + } + if (Header.Magic != ChunkedHeader::ExpectedMagic) + { + return {}; + } + if (Header.Version != ChunkedHeader::CurrentVersion) + { + return {}; + } + ChunkedInfo Info; + Info.RawSize = Header.RawSize; + Info.RawHash = Header.RawHash; + Info.ChunkSequence.resize(Header.ChunkSequenceLength); + Info.ChunkHashes.resize(Header.ChunkHashCount); + { + MutableMemoryView ChunkSequenceWriteView(Info.ChunkSequence.data(), sizeof(uint32_t) * Header.ChunkSequenceLength); + ChunkSequenceWriteView.CopyFrom(View.Mid(Header.ChunkSequenceOffset, ChunkSequenceWriteView.GetSize())); + } + { + MutableMemoryView ChunksWriteView(Info.ChunkHashes.data(), sizeof(IoHash) * Header.ChunkHashCount); + ChunksWriteView.CopyFrom(View.Mid(Header.ChunkHashesOffset, ChunksWriteView.GetSize())); + } + + return Info; +} + +void +Reconstruct(const ChunkedInfo& Info, const std::filesystem::path& TargetPath, std::function<IoBuffer(const IoHash& ChunkHash)> GetChunk) +{ + ZEN_TRACE_CPU("Reconstruct"); + BasicFile Reconstructed; + Reconstructed.Open(TargetPath, BasicFile::Mode::kTruncate); + BasicFileWriter ReconstructedWriter(Reconstructed, 64 * 1024); + uint64_t Offset = 0; + for (uint32_t SequenceIndex : Info.ChunkSequence) + { + IoBuffer Chunk = GetChunk(Info.ChunkHashes[SequenceIndex]); + ReconstructedWriter.Write(Chunk.GetData(), Chunk.GetSize(), Offset); + Offset += Chunk.GetSize(); + } +} + +ChunkedInfoWithSource +ChunkData(BasicFile& RawData, + uint64_t Offset, + uint64_t Size, + ChunkedParams Params, + std::atomic<uint64_t>* BytesProcessed, + std::atomic<bool>* AbortFlag) +{ + ZEN_TRACE_CPU("ChunkData"); + + ChunkedInfoWithSource Result; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> FoundChunks; + + ZenChunkHelper Chunker; + Chunker.SetUseThreshold(Params.UseThreshold); + Chunker.SetChunkSize(Params.MinSize, Params.MaxSize, Params.AvgSize); + size_t End = Offset + Size; + const size_t ScanBufferSize = Max(1u * 1024 * 1024, Params.MaxSize); + BasicFileBuffer RawBuffer(RawData, ScanBufferSize); + MemoryView SliceView = RawBuffer.MakeView(Min(End - Offset, ScanBufferSize), Offset); + ZEN_ASSERT(!SliceView.IsEmpty()); + size_t SliceSize = SliceView.GetSize(); + IoHashStream RawHashStream; + while (Offset < End) + { + if (AbortFlag != nullptr && AbortFlag->load()) + { + return {}; + } + size_t ScanLength = Chunker.ScanChunk(SliceView.GetData(), SliceSize); + if (ScanLength == ZenChunkHelper::kNoBoundaryFound) + { + if (Offset + SliceSize == End) + { + ScanLength = SliceSize; + } + else + { + SliceView = RawBuffer.MakeView(Min(End - Offset, ScanBufferSize), Offset); + SliceSize = SliceView.GetSize(); + Chunker.Reset(); + continue; + } + } + uint32_t ChunkLength = gsl::narrow<uint32_t>(ScanLength); // +HashedLength); + MemoryView ChunkView = SliceView.Left(ScanLength); + RawHashStream.Append(ChunkView); + IoHash ChunkHash = IoHash::HashBuffer(ChunkView); + SliceView.RightChopInline(ScanLength); + if (auto It = FoundChunks.find(ChunkHash); It != FoundChunks.end()) + { + Result.Info.ChunkSequence.push_back(It->second); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(Result.Info.ChunkHashes.size()); + FoundChunks.insert_or_assign(ChunkHash, ChunkIndex); + Result.Info.ChunkHashes.push_back(ChunkHash); + Result.ChunkSources.push_back(ChunkSource{.Offset = Offset, .Size = ChunkLength}); + Result.Info.ChunkSequence.push_back(ChunkIndex); + } + + SliceSize = SliceView.GetSize(); + Offset += ChunkLength; + if (BytesProcessed != nullptr) + { + BytesProcessed->fetch_add(ChunkLength); + } + } + Result.Info.RawSize = Size; + Result.Info.RawHash = RawHashStream.GetHash(); + return Result; +} + +} // namespace zen + +#if ZEN_WITH_TESTS +# include <zencore/filesystem.h> +# include <zencore/fmtutils.h> +# include <zencore/iohash.h> +# include <zencore/logging.h> +# include <zencore/scopeguard.h> +# include <zencore/timer.h> +# include <zencore/testing.h> +# include <zencore/testutils.h> +# include <zencore/workthreadpool.h> + +# include "chunking.h" + +ZEN_THIRD_PARTY_INCLUDES_START +# include <tsl/robin_map.h> +# include <tsl/robin_set.h> +ZEN_THIRD_PARTY_INCLUDES_END + +namespace zen { +# if 0 +TEST_CASE("chunkedfile.findparams") +{ +# if 1 + DirectoryContent SourceContent1; + GetDirectoryContent("E:\\Temp\\ChunkingTestData\\31379208", DirectoryContentFlags::IncludeFiles, SourceContent1); + const std::vector<std::filesystem::path>& SourceFiles1 = SourceContent1.Files; + DirectoryContent SourceContent2; + GetDirectoryContent("E:\\Temp\\ChunkingTestData\\31379208_2", DirectoryContentFlags::IncludeFiles, SourceContent2); + const std::vector<std::filesystem::path>& SourceFiles2 = SourceContent2.Files; +# else + std::filesystem::path SourcePath1 = + "E:\\Temp\\ChunkingTestData\\31375996\\ShaderArchive-FortniteGame_Chunk10-PCD3D_SM6-PCD3D_SM6.ushaderbytecode"; + std::filesystem::path SourcePath2 = + "E:\\Temp\\ChunkingTestData\\31379208\\ShaderArchive-FortniteGame_Chunk10-PCD3D_SM6-PCD3D_SM6.ushaderbytecode"; + const std::vector<std::filesystem::path>& SourceFiles1 = {SourcePath1}; + const std::vector<std::filesystem::path>& SourceFiles2 = {SourcePath2}; +# endif + ChunkedParams Params[] = {ChunkedParams{.UseThreshold = false, .MinSize = 17280, .MaxSize = 139264, .AvgSize = 36340}, + ChunkedParams{.UseThreshold = false, .MinSize = 15456, .MaxSize = 122880, .AvgSize = 35598}, + ChunkedParams{.UseThreshold = false, .MinSize = 16848, .MaxSize = 135168, .AvgSize = 39030}, + ChunkedParams{.UseThreshold = false, .MinSize = 14256, .MaxSize = 114688, .AvgSize = 36222}, + ChunkedParams{.UseThreshold = false, .MinSize = 15744, .MaxSize = 126976, .AvgSize = 36600}, + ChunkedParams{.UseThreshold = false, .MinSize = 15264, .MaxSize = 122880, .AvgSize = 35442}, + ChunkedParams{.UseThreshold = false, .MinSize = 16464, .MaxSize = 131072, .AvgSize = 37950}, + ChunkedParams{.UseThreshold = false, .MinSize = 15408, .MaxSize = 122880, .AvgSize = 38914}, + ChunkedParams{.UseThreshold = false, .MinSize = 15408, .MaxSize = 122880, .AvgSize = 35556}, + ChunkedParams{.UseThreshold = false, .MinSize = 15360, .MaxSize = 122880, .AvgSize = 35520}, + ChunkedParams{.UseThreshold = false, .MinSize = 15312, .MaxSize = 122880, .AvgSize = 35478}, + ChunkedParams{.UseThreshold = false, .MinSize = 16896, .MaxSize = 135168, .AvgSize = 39072}, + ChunkedParams{.UseThreshold = false, .MinSize = 15360, .MaxSize = 122880, .AvgSize = 38880}, + ChunkedParams{.UseThreshold = false, .MinSize = 15840, .MaxSize = 126976, .AvgSize = 36678}, + ChunkedParams{.UseThreshold = false, .MinSize = 16800, .MaxSize = 135168, .AvgSize = 38994}, + ChunkedParams{.UseThreshold = false, .MinSize = 15888, .MaxSize = 126976, .AvgSize = 36714}, + ChunkedParams{.UseThreshold = false, .MinSize = 15792, .MaxSize = 126976, .AvgSize = 36636}, + ChunkedParams{.UseThreshold = false, .MinSize = 14880, .MaxSize = 118784, .AvgSize = 37609}, + ChunkedParams{.UseThreshold = false, .MinSize = 15936, .MaxSize = 126976, .AvgSize = 36756}, + ChunkedParams{.UseThreshold = false, .MinSize = 15456, .MaxSize = 122880, .AvgSize = 38955}, + ChunkedParams{.UseThreshold = false, .MinSize = 15984, .MaxSize = 126976, .AvgSize = 36792}, + ChunkedParams{.UseThreshold = false, .MinSize = 14400, .MaxSize = 114688, .AvgSize = 36338}, + ChunkedParams{.UseThreshold = false, .MinSize = 14832, .MaxSize = 118784, .AvgSize = 37568}, + ChunkedParams{.UseThreshold = false, .MinSize = 16944, .MaxSize = 135168, .AvgSize = 39108}, + ChunkedParams{.UseThreshold = false, .MinSize = 14352, .MaxSize = 114688, .AvgSize = 36297}, + ChunkedParams{.UseThreshold = false, .MinSize = 14208, .MaxSize = 114688, .AvgSize = 36188}, + ChunkedParams{.UseThreshold = false, .MinSize = 14448, .MaxSize = 114688, .AvgSize = 36372}, + ChunkedParams{.UseThreshold = false, .MinSize = 13296, .MaxSize = 106496, .AvgSize = 36592}, + ChunkedParams{.UseThreshold = false, .MinSize = 15264, .MaxSize = 122880, .AvgSize = 38805}, + ChunkedParams{.UseThreshold = false, .MinSize = 14304, .MaxSize = 114688, .AvgSize = 36263}, + ChunkedParams{.UseThreshold = false, .MinSize = 14784, .MaxSize = 118784, .AvgSize = 37534}, + ChunkedParams{.UseThreshold = false, .MinSize = 15312, .MaxSize = 122880, .AvgSize = 38839}, + ChunkedParams{.UseThreshold = false, .MinSize = 14256, .MaxSize = 114688, .AvgSize = 39360}, + ChunkedParams{.UseThreshold = false, .MinSize = 13776, .MaxSize = 110592, .AvgSize = 37976}, + ChunkedParams{.UseThreshold = false, .MinSize = 14736, .MaxSize = 118784, .AvgSize = 37493}, + ChunkedParams{.UseThreshold = false, .MinSize = 14928, .MaxSize = 118784, .AvgSize = 37643}, + ChunkedParams{.UseThreshold = false, .MinSize = 14448, .MaxSize = 114688, .AvgSize = 39504}, + ChunkedParams{.UseThreshold = false, .MinSize = 13392, .MaxSize = 106496, .AvgSize = 36664}, + ChunkedParams{.UseThreshold = false, .MinSize = 13872, .MaxSize = 110592, .AvgSize = 38048}, + ChunkedParams{.UseThreshold = false, .MinSize = 14352, .MaxSize = 114688, .AvgSize = 39432}, + ChunkedParams{.UseThreshold = false, .MinSize = 13200, .MaxSize = 106496, .AvgSize = 36520}, + ChunkedParams{.UseThreshold = false, .MinSize = 17328, .MaxSize = 139264, .AvgSize = 36378}, + ChunkedParams{.UseThreshold = false, .MinSize = 17376, .MaxSize = 139264, .AvgSize = 36421}, + ChunkedParams{.UseThreshold = false, .MinSize = 17424, .MaxSize = 139264, .AvgSize = 36459}, + ChunkedParams{.UseThreshold = false, .MinSize = 17472, .MaxSize = 139264, .AvgSize = 36502}, + ChunkedParams{.UseThreshold = false, .MinSize = 17520, .MaxSize = 139264, .AvgSize = 36540}, + ChunkedParams{.UseThreshold = false, .MinSize = 17808, .MaxSize = 143360, .AvgSize = 37423}, + ChunkedParams{.UseThreshold = false, .MinSize = 17856, .MaxSize = 143360, .AvgSize = 37466}, + ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 25834}, + ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 21917}, + ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 29751}, + ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 33668}, + ChunkedParams{.UseThreshold = false, .MinSize = 17952, .MaxSize = 143360, .AvgSize = 37547}, + ChunkedParams{.UseThreshold = false, .MinSize = 17904, .MaxSize = 143360, .AvgSize = 37504}, + ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 22371}, + ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 37585}, + ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 26406}, + ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 26450}, + ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 30615}, + ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 30441}, + ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 22417}, + ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 22557}, + ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 30528}, + ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 27112}, + ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 34644}, + ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 34476}, + ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 35408}, + ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 38592}, + ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 30483}, + ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 26586}, + ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 26496}, + ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 31302}, + ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 34516}, + ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 22964}, + ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 35448}, + ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 38630}, + ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 23010}, + ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 31260}, + ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 34600}, + ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 27156}, + ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 30570}, + ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 38549}, + ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 22510}, + ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 38673}, + ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 34560}, + ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 22464}, + ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 26540}, + ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 38511}, + ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 23057}, + ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 27202}, + ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 31347}, + ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 35492}, + ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 31389}, + ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 27246}, + ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 23103}, + ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 35532}, + ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 23150}, + ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 27292}, + ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 31434}, + ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 35576}, + ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 27336}, + ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 23196}, + ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 31476}, + ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 35616}, + ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 27862}, + ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 32121}, + ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 23603}, + ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 36380}, + ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 27908}, + ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 23650}, + ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 32166}, + ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 36424}, + ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 23696}, + ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 32253}, + ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 32208}, + ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 23743}, + ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 36548}, + ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 28042}, + ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 23789}, + ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 32295}, + ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 36508}, + ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 27952}, + ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 27998}, + ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 36464}}; + + static const size_t ParamsCount = sizeof(Params) / sizeof(ChunkedParams); + std::vector<ChunkedInfoWithSource> Infos1(SourceFiles1.size()); + std::vector<ChunkedInfoWithSource> Infos2(SourceFiles2.size()); + + WorkerThreadPool WorkerPool(32); + + for (size_t I = 0; I < ParamsCount; I++) + { + for (int UseThreshold = 0; UseThreshold < 2; UseThreshold++) + { + Latch WorkLatch(1); + ChunkedParams Param = Params[I]; + Param.UseThreshold = UseThreshold == 1; + Stopwatch Timer; + for (size_t F = 0; F < SourceFiles1.size(); F++) + { + WorkLatch.AddCount(1); + WorkerPool.ScheduleWork([&WorkLatch, F, Param, &SourceFiles1, &Infos1]() { + auto _ = MakeGuard([&WorkLatch]() { WorkLatch.CountDown(); }); + BasicFile SourceData1; + SourceData1.Open(SourceFiles1[F], BasicFile::Mode::kRead); + Infos1[F] = ChunkData(SourceData1, 0, SourceData1.FileSize(), Param); + }); + } + for (size_t F = 0; F < SourceFiles2.size(); F++) + { + WorkLatch.AddCount(1); + WorkerPool.ScheduleWork([&WorkLatch, F, Param, &SourceFiles2, &Infos2]() { + auto _ = MakeGuard([&WorkLatch]() { WorkLatch.CountDown(); }); + BasicFile SourceData2; + SourceData2.Open(SourceFiles2[F], BasicFile::Mode::kRead); + Infos2[F] = ChunkData(SourceData2, 0, SourceData2.FileSize(), Param); + }); + } + WorkLatch.CountDown(); + WorkLatch.Wait(); + uint64_t ChunkTimeMS = Timer.GetElapsedTimeMs(); + + uint64_t Raw1Size = 0; + tsl::robin_set<IoHash> Chunks1; + size_t ChunkedSize1 = 0; + for (size_t F = 0; F < SourceFiles1.size(); F++) + { + const ChunkedInfoWithSource& Info = Infos1[F]; + Raw1Size += Info.Info.RawSize; + for (uint32_t Chunk1Index = 0; Chunk1Index < Info.Info.ChunkHashes.size(); ++Chunk1Index) + { + const IoHash ChunkHash = Info.Info.ChunkHashes[Chunk1Index]; + if (Chunks1.insert(ChunkHash).second) + { + ChunkedSize1 += Info.ChunkSources[Chunk1Index].Size; + } + } + } + + uint64_t Raw2Size = 0; + tsl::robin_set<IoHash> Chunks2; + size_t ChunkedSize2 = 0; + size_t DiffSize = 0; + for (size_t F = 0; F < SourceFiles2.size(); F++) + { + const ChunkedInfoWithSource& Info = Infos2[F]; + Raw2Size += Info.Info.RawSize; + for (uint32_t Chunk2Index = 0; Chunk2Index < Info.Info.ChunkHashes.size(); ++Chunk2Index) + { + const IoHash ChunkHash = Info.Info.ChunkHashes[Chunk2Index]; + if (Chunks2.insert(ChunkHash).second) + { + ChunkedSize2 += Info.ChunkSources[Chunk2Index].Size; + if (!Chunks1.contains(ChunkHash)) + { + DiffSize += Info.ChunkSources[Chunk2Index].Size; + } + } + } + } + + ZEN_INFO( + "Diff = {}, Chunks1 = {}, Chunks2 = {}, .UseThreshold = {}, .MinSize = {}, .MaxSize = {}, .AvgSize = {}, RawSize(1) = {}, " + "RawSize(2) = {}, " + "Saved(1) = {}, Saved(2) = {} in {}", + NiceBytes(DiffSize), + Chunks1.size(), + Chunks2.size(), + Param.UseThreshold, + Param.MinSize, + Param.MaxSize, + Param.AvgSize, + NiceBytes(Raw1Size), + NiceBytes(Raw2Size), + NiceBytes(Raw1Size - ChunkedSize1), + NiceBytes(Raw2Size - ChunkedSize2), + NiceTimeSpanMs(ChunkTimeMS)); + } + } + +# if 0 + for (int64_t MinSizeBase = (12u * 1024u); MinSizeBase <= (32u * 1024u); MinSizeBase += 512) + { + for (int64_t Wiggle = -132; Wiggle < 126; Wiggle += 2) + { + // size_t MinSize = 7 * 1024 - 61; // (size_t)(MinSizeBase + Wiggle); + // size_t MaxSize = 16 * (7 * 1024); // 8 * 7 * 1024;// MinSizeBase * 6; + // size_t AvgSize = MaxSize / 2; // 4 * 7 * 1024;// MinSizeBase * 3; + size_t MinSize = (size_t)(MinSizeBase + Wiggle); + //for (size_t MaxSize = (MinSize * 4) - 768; MaxSize < (MinSize * 5) + 768; MaxSize += 64) + size_t MaxSize = 8u * MinSizeBase; + { + for (size_t AvgSize = (MaxSize - MinSize) / 32 + MinSize; AvgSize < (MaxSize - MinSize) / 4 + MinSize; AvgSize += (MaxSize - MinSize) / 32) +// size_t AvgSize = (MaxSize - MinSize) / 4 + MinSize; + { + WorkLatch.AddCount(1); + WorkerPool.ScheduleWork([&WorkLatch, MinSize, MaxSize, AvgSize, SourcePath1, SourcePath2]() + { + auto _ = MakeGuard([&WorkLatch]() { WorkLatch.CountDown(); }); + ChunkedParams Params{ .UseThreshold = true, .MinSize = MinSize, .MaxSize = MaxSize, .AvgSize = AvgSize }; + BasicFile SourceData1; + SourceData1.Open(SourcePath1, BasicFile::Mode::kRead); + BasicFile SourceData2; + SourceData2.Open(SourcePath2, BasicFile::Mode::kRead); + ChunkedInfoWithSource Info1 = ChunkData(SourceData1, Params); + ChunkedInfoWithSource Info2 = ChunkData(SourceData2, Params); + + tsl::robin_set<IoHash> Chunks1; + Chunks1.reserve(Info1.Info.ChunkHashes.size()); + Chunks1.insert(Info1.Info.ChunkHashes.begin(), Info1.Info.ChunkHashes.end()); + size_t ChunkedSize1 = 0; + for (uint32_t Chunk1Index = 0; Chunk1Index < Info1.Info.ChunkHashes.size(); ++Chunk1Index) + { + ChunkedSize1 += Info1.ChunkSources[Chunk1Index].Size; + } + size_t DiffSavedSize = 0; + size_t ChunkedSize2 = 0; + for (uint32_t Chunk2Index = 0; Chunk2Index < Info2.Info.ChunkHashes.size(); ++Chunk2Index) + { + ChunkedSize2 += Info2.ChunkSources[Chunk2Index].Size; + if (Chunks1.find(Info2.Info.ChunkHashes[Chunk2Index]) == Chunks1.end()) + { + DiffSavedSize += Info2.ChunkSources[Chunk2Index].Size; + } + } + ZEN_INFO("Diff {}, Chunks1: {}, Chunks2: {}, Min: {}, Max: {}, Avg: {}, Saved(1) {}, Saved(2) {}", + NiceBytes(DiffSavedSize), + Info1.Info.ChunkHashes.size(), + Info2.Info.ChunkHashes.size(), + MinSize, + MaxSize, + AvgSize, + NiceBytes(Info1.Info.RawSize - ChunkedSize1), + NiceBytes(Info2.Info.RawSize - ChunkedSize2)); + }); + } + } + } + } +# endif // 0 + + // WorkLatch.CountDown(); + // WorkLatch.Wait(); +} +# endif // 0 + +void +chunkedfile_forcelink() +{ +} + +} // namespace zen + +#endif diff --git a/src/zenremotestore/chunking/chunking.cpp b/src/zenremotestore/chunking/chunking.cpp new file mode 100644 index 000000000..71f0a06e4 --- /dev/null +++ b/src/zenremotestore/chunking/chunking.cpp @@ -0,0 +1,383 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "chunking.h" + +#include <gsl/gsl-lite.hpp> + +#include <cmath> +#include <cstring> + +namespace zen::detail { + +static const uint32_t BuzhashTable[] = { + 0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68, 0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801, 0x7ebf5191, 0x841135c7, 0x65cc53b3, + 0x280a597c, 0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494, 0xec85c4e6, 0xb7d33edc, 0xe549b544, 0xfdeda5aa, 0x882bf287, 0x3116737c, + 0x05569956, 0xe8cc1f68, 0x0806ac5e, 0x22a14443, 0x15297e10, 0x50d090e7, 0x4ba60f6f, 0xefd9f1a7, 0x5c5c885c, 0x82482f93, 0x9bfd7c64, + 0x0b3e7276, 0xf2688e77, 0x8fad8abc, 0xb0509568, 0xf1ada29f, 0xa53efdfe, 0xcb2b1d00, 0xf2a9e986, 0x6463432b, 0x95094051, 0x5a223ad2, + 0x9be8401b, 0x61e579cb, 0x1a556a14, 0x5840fdc2, 0x9261ddf6, 0xcde002bb, 0x52432bb0, 0xbf17373e, 0x7b7c222f, 0x2955ed16, 0x9f10ca59, + 0xe840c4c9, 0xccabd806, 0x14543f34, 0x1462417a, 0x0d4a1f9c, 0x087ed925, 0xd7f8f24c, 0x7338c425, 0xcf86c8f5, 0xb19165cd, 0x9891c393, + 0x325384ac, 0x0308459d, 0x86141d7e, 0xc922116a, 0xe2ffa6b6, 0x53f52aed, 0x2cd86197, 0xf5b9f498, 0xbf319c8f, 0xe0411fae, 0x977eb18c, + 0xd8770976, 0x9833466a, 0xc674df7f, 0x8c297d45, 0x8ca48d26, 0xc49ed8e2, 0x7344f874, 0x556f79c7, 0x6b25eaed, 0xa03e2b42, 0xf68f66a4, + 0x8e8b09a2, 0xf2e0e62a, 0x0d3a9806, 0x9729e493, 0x8c72b0fc, 0x160b94f6, 0x450e4d3d, 0x7a320e85, 0xbef8f0e1, 0x21d73653, 0x4e3d977a, + 0x1e7b3929, 0x1cc6c719, 0xbe478d53, 0x8d752809, 0xe6d8c2c6, 0x275f0892, 0xc8acc273, 0x4cc21580, 0xecc4a617, 0xf5f7be70, 0xe795248a, + 0x375a2fe9, 0x425570b6, 0x8898dcf8, 0xdc2d97c4, 0x0106114b, 0x364dc22f, 0x1e0cad1f, 0xbe63803c, 0x5f69fac2, 0x4d5afa6f, 0x1bc0dfb5, + 0xfb273589, 0x0ea47f7b, 0x3c1c2b50, 0x21b2a932, 0x6b1223fd, 0x2fe706a8, 0xf9bd6ce2, 0xa268e64e, 0xe987f486, 0x3eacf563, 0x1ca2018c, + 0x65e18228, 0x2207360a, 0x57cf1715, 0x34c37d2b, 0x1f8f3cde, 0x93b657cf, 0x31a019fd, 0xe69eb729, 0x8bca7b9b, 0x4c9d5bed, 0x277ebeaf, + 0xe0d8f8ae, 0xd150821c, 0x31381871, 0xafc3f1b0, 0x927db328, 0xe95effac, 0x305a47bd, 0x426ba35b, 0x1233af3f, 0x686a5b83, 0x50e072e5, + 0xd9d3bb2a, 0x8befc475, 0x487f0de6, 0xc88dff89, 0xbd664d5e, 0x971b5d18, 0x63b14847, 0xd7d3c1ce, 0x7f583cf3, 0x72cbcb09, 0xc0d0a81c, + 0x7fa3429b, 0xe9158a1b, 0x225ea19a, 0xd8ca9ea3, 0xc763b282, 0xbb0c6341, 0x020b8293, 0xd4cd299d, 0x58cfa7f8, 0x91b4ee53, 0x37e4d140, + 0x95ec764c, 0x30f76b06, 0x5ee68d24, 0x679c8661, 0xa41979c2, 0xf2b61284, 0x4fac1475, 0x0adb49f9, 0x19727a23, 0x15a7e374, 0xc43a18d5, + 0x3fb1aa73, 0x342fc615, 0x924c0793, 0xbee2d7f0, 0x8a279de9, 0x4aa2d70c, 0xe24dd37f, 0xbe862c0b, 0x177c22c2, 0x5388e5ee, 0xcd8a7510, + 0xf901b4fd, 0xdbc13dbc, 0x6c0bae5b, 0x64efe8c7, 0x48b02079, 0x80331a49, 0xca3d8ae6, 0xf3546190, 0xfed7108b, 0xc49b941b, 0x32baf4a9, + 0xeb833a4a, 0x88a3f1a5, 0x3a91ce0a, 0x3cc27da1, 0x7112e684, 0x4a3096b1, 0x3794574c, 0xa3c8b6f3, 0x1d213941, 0x6e0a2e00, 0x233479f1, + 0x0f4cd82f, 0x6093edd2, 0x5d7d209e, 0x464fe319, 0xd4dcac9e, 0x0db845cb, 0xfb5e4bc3, 0xe0256ce1, 0x09fb4ed1, 0x0914be1e, 0xa5bdb2c3, + 0xc6eb57bb, 0x30320350, 0x3f397e91, 0xa67791bc, 0x86bc0e2c, 0xefa0a7e2, 0xe9ff7543, 0xe733612c, 0xd185897b, 0x329e5388, 0x91dd236b, + 0x2ecb0d93, 0xf4d82a3d, 0x35b5c03f, 0xe4e606f0, 0x05b21843, 0x37b45964, 0x5eff22f4, 0x6027f4cc, 0x77178b3c, 0xae507131, 0x7bf7cabc, + 0xf9c18d66, 0x593ade65, 0xd95ddf11, +}; + +// ROL operation (compiler turns this into a ROL when optimizing) +ZEN_FORCEINLINE static uint32_t +Rotate32(uint32_t Value, size_t RotateCount) +{ + RotateCount &= 31; + + return ((Value) << (RotateCount)) | ((Value) >> (32 - RotateCount)); +} + +} // namespace zen::detail + +namespace zen { + +void +ZenChunkHelper::Reset() +{ + InternalReset(); + + m_BytesScanned = 0; +} + +void +ZenChunkHelper::InternalReset() +{ + m_CurrentHash = 0; + m_CurrentChunkSize = 0; + m_WindowSize = 0; +} + +void +ZenChunkHelper::SetChunkSize(size_t MinSize, size_t MaxSize, size_t AvgSize) +{ + if (m_WindowSize) + return; // Already started + + static_assert(kChunkSizeLimitMin > kWindowSize); + + if (AvgSize) + { + // TODO: Validate AvgSize range + } + else + { + if (MinSize && MaxSize) + { + AvgSize = std::lrint(std::pow(2, (std::log2(MinSize) + std::log2(MaxSize)) / 2)); + } + else if (MinSize) + { + AvgSize = MinSize * 4; + } + else if (MaxSize) + { + AvgSize = MaxSize / 4; + } + else + { + AvgSize = kDefaultAverageChunkSize; + } + } + + if (MinSize) + { + // TODO: Validate MinSize range + } + else + { + MinSize = std::max(AvgSize / 4, kChunkSizeLimitMin); + } + + if (MaxSize) + { + // TODO: Validate MaxSize range + } + else + { + MaxSize = std::min(AvgSize * 4, kChunkSizeLimitMax); + } + + m_Discriminator = gsl::narrow<uint32_t>(AvgSize - MinSize); + + if (m_Discriminator < MinSize) + { + m_Discriminator = gsl::narrow<uint32_t>(MinSize); + } + + if (m_Discriminator > MaxSize) + { + m_Discriminator = gsl::narrow<uint32_t>(MaxSize); + } + + m_Threshold = gsl::narrow<uint32_t>((uint64_t(std::numeric_limits<uint32_t>::max()) + 1) / m_Discriminator); + + m_ChunkSizeMin = MinSize; + m_ChunkSizeMax = MaxSize; + m_ChunkSizeAvg = AvgSize; +} + +size_t +ZenChunkHelper::ScanChunk(const void* DataBytesIn, size_t ByteCount) +{ + size_t Result = InternalScanChunk(DataBytesIn, ByteCount); + + if (Result == kNoBoundaryFound) + { + m_BytesScanned += ByteCount; + } + else + { + m_BytesScanned += Result; + } + + return Result; +} + +size_t +ZenChunkHelper::InternalScanChunk(const void* DataBytesIn, size_t ByteCount) +{ + size_t CurrentOffset = 0; + const uint8_t* CursorPtr = reinterpret_cast<const uint8_t*>(DataBytesIn); + + // There's no point in updating the hash if we know we're not + // going to have a cut point, so just skip the data. This logic currently + // provides roughly a 20% speedup on my machine + + const size_t NeedHashOffset = m_ChunkSizeMin - kWindowSize; + + if (m_CurrentChunkSize < NeedHashOffset) + { + const uint32_t SkipBytes = gsl::narrow<uint32_t>(std::min<uint64_t>(ByteCount, NeedHashOffset - m_CurrentChunkSize)); + + ByteCount -= SkipBytes; + m_CurrentChunkSize += SkipBytes; + CurrentOffset += SkipBytes; + CursorPtr += SkipBytes; + + m_WindowSize = 0; + + if (ByteCount == 0) + { + return kNoBoundaryFound; + } + } + + // Fill window first + + if (m_WindowSize < kWindowSize) + { + const uint32_t FillBytes = uint32_t(std::min<size_t>(ByteCount, kWindowSize - m_WindowSize)); + + memcpy(&m_Window[m_WindowSize], CursorPtr, FillBytes); + + CursorPtr += FillBytes; + + m_WindowSize += FillBytes; + m_CurrentChunkSize += FillBytes; + + CurrentOffset += FillBytes; + ByteCount -= FillBytes; + + if (m_WindowSize < kWindowSize) + { + return kNoBoundaryFound; + } + + // We have a full window, initialize hash + + uint32_t CurrentHash = 0; + + for (int i = 1; i < kWindowSize; ++i) + { + CurrentHash ^= detail::Rotate32(detail::BuzhashTable[m_Window[i - 1]], kWindowSize - i); + } + + m_CurrentHash = CurrentHash ^ detail::BuzhashTable[m_Window[kWindowSize - 1]]; + } + + // Scan for boundaries (i.e points where the hash matches the value determined by + // the discriminator) + + uint32_t CurrentHash = m_CurrentHash; + uint32_t CurrentChunkSize = m_CurrentChunkSize; + + size_t Index = CurrentChunkSize % kWindowSize; + + if (m_Threshold && m_UseThreshold) + { + // This is roughly 4x faster than the general modulo approach on my + // TR 3990X (~940MB/sec) and doesn't require any special parameters to + // achieve max performance + + while (ByteCount) + { + const uint8_t NewByte = *CursorPtr; + const uint8_t OldByte = m_Window[Index]; + + CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::BuzhashTable[OldByte], m_WindowSize) ^ + detail::BuzhashTable[NewByte]; + + CurrentChunkSize++; + CurrentOffset++; + + if (CurrentChunkSize >= m_ChunkSizeMin) + { + bool FoundBoundary; + + if (CurrentChunkSize >= m_ChunkSizeMax) + { + FoundBoundary = true; + } + else + { + FoundBoundary = CurrentHash <= m_Threshold; + } + + if (FoundBoundary) + { + // Boundary found! + InternalReset(); + + return CurrentOffset; + } + } + + m_Window[Index++] = *CursorPtr; + + if (Index == kWindowSize) + { + Index = 0; + } + + ++CursorPtr; + --ByteCount; + } + } + else if ((m_Discriminator & (m_Discriminator - 1)) == 0) + { + // This is quite a bit faster than the generic modulo path, but + // requires a very specific average chunk size to be used. If you + // pass in an even power-of-two divided by 0.75 as the average + // chunk size you'll hit this path + + const uint32_t Mask = m_Discriminator - 1; + + while (ByteCount) + { + const uint8_t NewByte = *CursorPtr; + const uint8_t OldByte = m_Window[Index]; + + CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::BuzhashTable[OldByte], m_WindowSize) ^ + detail::BuzhashTable[NewByte]; + + CurrentChunkSize++; + CurrentOffset++; + + if (CurrentChunkSize >= m_ChunkSizeMin) + { + bool FoundBoundary; + + if (CurrentChunkSize >= m_ChunkSizeMax) + { + FoundBoundary = true; + } + else + { + FoundBoundary = (CurrentHash & Mask) == Mask; + } + + if (FoundBoundary) + { + // Boundary found! + InternalReset(); + + return CurrentOffset; + } + } + + m_Window[Index++] = *CursorPtr; + + if (Index == kWindowSize) + { + Index = 0; + } + + ++CursorPtr; + --ByteCount; + } + } + else + { + // This is the slowest path, which caps out around 250MB/sec for large sizes + // on my TR3900X + + while (ByteCount) + { + const uint8_t NewByte = *CursorPtr; + const uint8_t OldByte = m_Window[Index]; + + CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::BuzhashTable[OldByte], m_WindowSize) ^ + detail::BuzhashTable[NewByte]; + + CurrentChunkSize++; + CurrentOffset++; + + if (CurrentChunkSize >= m_ChunkSizeMin) + { + bool FoundBoundary; + + if (CurrentChunkSize >= m_ChunkSizeMax) + { + FoundBoundary = true; + } + else + { + FoundBoundary = (CurrentHash % m_Discriminator) == (m_Discriminator - 1); + } + + if (FoundBoundary) + { + // Boundary found! + InternalReset(); + + return CurrentOffset; + } + } + + m_Window[Index++] = *CursorPtr; + + if (Index == kWindowSize) + { + Index = 0; + } + + ++CursorPtr; + --ByteCount; + } + } + + m_CurrentChunkSize = CurrentChunkSize; + m_CurrentHash = CurrentHash; + + return kNoBoundaryFound; +} + +} // namespace zen diff --git a/src/zenremotestore/chunking/chunking.h b/src/zenremotestore/chunking/chunking.h new file mode 100644 index 000000000..09c56454f --- /dev/null +++ b/src/zenremotestore/chunking/chunking.h @@ -0,0 +1,56 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once +#include <zencore/zencore.h> + +namespace zen { + +/** Content-defined chunking helper + */ +class ZenChunkHelper +{ +public: + void SetChunkSize(size_t MinSize, size_t MaxSize, size_t AvgSize); + size_t ScanChunk(const void* DataBytes, size_t ByteCount); + void Reset(); + + // This controls which chunking approach is used - threshold or + // modulo based. Threshold is faster and generates similarly sized + // chunks + void SetUseThreshold(bool NewState) { m_UseThreshold = NewState; } + + inline size_t ChunkSizeMin() const { return m_ChunkSizeMin; } + inline size_t ChunkSizeMax() const { return m_ChunkSizeMax; } + inline size_t ChunkSizeAvg() const { return m_ChunkSizeAvg; } + inline uint64_t BytesScanned() const { return m_BytesScanned; } + + static constexpr size_t kNoBoundaryFound = size_t(~0ull); + +private: + size_t m_ChunkSizeMin = 0; + size_t m_ChunkSizeMax = 0; + size_t m_ChunkSizeAvg = 0; + + uint32_t m_Discriminator = 0; // Computed in SetChunkSize() + uint32_t m_Threshold = 0; // Computed in SetChunkSize() + + bool m_UseThreshold = true; + + static constexpr size_t kChunkSizeLimitMax = 64 * 1024 * 1024; + static constexpr size_t kChunkSizeLimitMin = 1024; + static constexpr size_t kDefaultAverageChunkSize = 64 * 1024; + + static constexpr int kWindowSize = 48; + uint8_t m_Window[kWindowSize]; + uint32_t m_WindowSize = 0; + + uint32_t m_CurrentHash = 0; + uint32_t m_CurrentChunkSize = 0; + + uint64_t m_BytesScanned = 0; + + size_t InternalScanChunk(const void* DataBytes, size_t ByteCount); + void InternalReset(); +}; + +} // namespace zen diff --git a/src/zenremotestore/chunking/chunkingcontroller.cpp b/src/zenremotestore/chunking/chunkingcontroller.cpp new file mode 100644 index 000000000..49332c2ce --- /dev/null +++ b/src/zenremotestore/chunking/chunkingcontroller.cpp @@ -0,0 +1,359 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zenremotestore/chunking/chunkingcontroller.h> + +#include <zencore/basicfile.h> +#include <zencore/compactbinarybuilder.h> +#include <zencore/filesystem.h> +#include <zencore/trace.h> + +ZEN_THIRD_PARTY_INCLUDES_START +#include <tsl/robin_map.h> +ZEN_THIRD_PARTY_INCLUDES_END + +namespace zen { +using namespace std::literals; + +namespace { + std::vector<std::string> ReadStringArray(CbArrayView StringArray) + { + std::vector<std::string> Result; + Result.reserve(StringArray.Num()); + for (CbFieldView FieldView : StringArray) + { + Result.emplace_back(FieldView.AsString()); + } + return Result; + } + + ChunkedParams ReadChunkParams(CbObjectView Params) + { + bool UseThreshold = Params["UseThreshold"sv].AsBool(true); + size_t MinSize = Params["MinSize"sv].AsUInt64(DefaultChunkedParams.MinSize); + size_t MaxSize = Params["MaxSize"sv].AsUInt64(DefaultChunkedParams.MaxSize); + size_t AvgSize = Params["AvgSize"sv].AsUInt64(DefaultChunkedParams.AvgSize); + + return ChunkedParams{.UseThreshold = UseThreshold, .MinSize = MinSize, .MaxSize = MaxSize, .AvgSize = AvgSize}; + } + + void WriteChunkParams(CbObjectWriter& Writer, const ChunkedParams& Params) + { + Writer.BeginObject("ChunkingParams"sv); + { + Writer.AddBool("UseThreshold"sv, Params.UseThreshold); + + Writer.AddInteger("MinSize"sv, (uint64_t)Params.MinSize); + Writer.AddInteger("MaxSize"sv, (uint64_t)Params.MaxSize); + Writer.AddInteger("AvgSize"sv, (uint64_t)Params.AvgSize); + } + Writer.EndObject(); // ChunkingParams + } + + bool IsElfFile(BasicFile& Buffer) + { + if (Buffer.FileSize() > 4) + { + uint32_t ElfCheck = 0; + Buffer.Read(&ElfCheck, 4, 0); + if (ElfCheck == 0x464c457f) + { + return true; + } + } + return false; + } + + bool IsMachOFile(BasicFile& Buffer) + { + if (Buffer.FileSize() > 4) + { + uint32_t MachOCheck = 0; + Buffer.Read(&MachOCheck, 4, 0); + if ((MachOCheck == 0xfeedface) || (MachOCheck == 0xcefaedfe)) + { + return true; + } + } + return false; + } +} // namespace + +class BasicChunkingController : public ChunkingController +{ +public: + BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings) {} + + BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + + virtual bool ProcessFile(const std::filesystem::path& InputPath, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + std::atomic<bool>& AbortFlag) const override + { + ZEN_TRACE_CPU("BasicChunkingController::ProcessFile"); + const bool ExcludeFromChunking = + std::find(m_Settings.ExcludeExtensions.begin(), m_Settings.ExcludeExtensions.end(), InputPath.extension()) != + m_Settings.ExcludeExtensions.end(); + + if (ExcludeFromChunking || (RawSize < m_Settings.ChunkFileSizeLimit)) + { + return false; + } + + BasicFile Buffer(InputPath, BasicFile::Mode::kRead); + if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + { + return false; + } + if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + { + return false; + } + + OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); + return true; + } + + virtual std::string_view GetName() const override { return Name; } + + virtual CbObject GetParameters() const override + { + CbObjectWriter Writer; + Writer.BeginArray("ChunkExcludeExtensions"sv); + { + for (const std::string& Extension : m_Settings.ExcludeExtensions) + { + Writer.AddString(Extension); + } + } + Writer.EndArray(); // ChunkExcludeExtensions + + Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); + Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + + WriteChunkParams(Writer, m_Settings.ChunkingParams); + + return Writer.Save(); + } + static constexpr std::string_view Name = "BasicChunkingController"sv; + +private: + static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters) + { + return BasicChunkingControllerSettings{ + .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()), + .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), + .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), + .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), + .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; + } + + const BasicChunkingControllerSettings m_Settings; +}; + +class ChunkingControllerWithFixedChunking : public ChunkingController +{ +public: + ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings) {} + + ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + + virtual bool ProcessFile(const std::filesystem::path& InputPath, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + std::atomic<bool>& AbortFlag) const override + { + ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile"); + const bool ExcludeFromChunking = + std::find(m_Settings.ExcludeExtensions.begin(), m_Settings.ExcludeExtensions.end(), InputPath.extension()) != + m_Settings.ExcludeExtensions.end(); + + if (ExcludeFromChunking || (RawSize < m_Settings.ChunkFileSizeLimit)) + { + return false; + } + + const bool FixedChunkingExtension = + std::find(m_Settings.FixedChunkingExtensions.begin(), m_Settings.FixedChunkingExtensions.end(), InputPath.extension()) != + m_Settings.FixedChunkingExtensions.end(); + + if (FixedChunkingExtension) + { + if (RawSize < m_Settings.MinSizeForFixedChunking) + { + return false; + } + ZEN_TRACE_CPU("FixedChunking"); + IoHashStream FullHasher; + BasicFile Source(InputPath, BasicFile::Mode::kRead); + uint64_t Offset = 0; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize); + ChunkHashToChunkIndex.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount); + OutChunked.ChunkSources.reserve(ExpectedChunkCount); + + static const uint64_t BufferingSize = 256u * 1024u; + + IoHashStream ChunkHasher; + + while (Offset < RawSize) + { + if (AbortFlag) + { + return false; + } + + ChunkHasher.Reset(); + + uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize); + if (ChunkSize >= (BufferingSize + BufferingSize / 2)) + { + ScanFile(Source.Handle(), + Offset, + ChunkSize, + BufferingSize, + [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) { + FullHasher.Append(Data, Size); + ChunkHasher.Append(Data, Size); + BytesProcessed.fetch_add(Size); + }); + } + else + { + IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize); + FullHasher.Append(ChunkData); + ChunkHasher.Append(ChunkData); + BytesProcessed.fetch_add(ChunkSize); + } + + const IoHash ChunkHash = ChunkHasher.GetHash(); + if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) + { + OutChunked.Info.ChunkSequence.push_back(It->second); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size()); + OutChunked.Info.ChunkHashes.push_back(ChunkHash); + OutChunked.Info.ChunkSequence.push_back(ChunkIndex); + OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)}); + } + Offset += ChunkSize; + } + OutChunked.Info.RawSize = RawSize; + OutChunked.Info.RawHash = FullHasher.GetHash(); + return true; + } + else + { + BasicFile Buffer(InputPath, BasicFile::Mode::kRead); + if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + { + return false; + } + if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + { + return false; + } + + OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); + return true; + } + } + + virtual std::string_view GetName() const override { return Name; } + + virtual CbObject GetParameters() const override + { + CbObjectWriter Writer; + Writer.BeginArray("FixedChunkingExtensions"); + { + for (const std::string& Extension : m_Settings.FixedChunkingExtensions) + { + Writer.AddString(Extension); + } + } + Writer.EndArray(); // ChunkExcludeExtensions + + Writer.BeginArray("ChunkExcludeExtensions"sv); + { + for (const std::string& Extension : m_Settings.ExcludeExtensions) + { + Writer.AddString(Extension); + } + } + Writer.EndArray(); // ChunkExcludeExtensions + + Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); + Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + + Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + + WriteChunkParams(Writer, m_Settings.ChunkingParams); + + Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize); + Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking); + return Writer.Save(); + } + + static constexpr std::string_view Name = "ChunkingControllerWithFixedChunking"sv; + +private: + static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters) + { + return ChunkingControllerWithFixedChunkingSettings{ + .FixedChunkingExtensions = ReadStringArray(Parameters["FixedChunkingExtensions"sv].AsArrayView()), + .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()), + .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), + .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), + .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), + .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()), + .FixedChunkingChunkSize = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize), + .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultFixedChunkingChunkSize)}; + } + + const ChunkingControllerWithFixedChunkingSettings m_Settings; +}; + +std::unique_ptr<ChunkingController> +CreateBasicChunkingController(const BasicChunkingControllerSettings& Settings) +{ + return std::make_unique<BasicChunkingController>(Settings); +} +std::unique_ptr<ChunkingController> +CreateBasicChunkingController(CbObjectView Parameters) +{ + return std::make_unique<BasicChunkingController>(Parameters); +} + +std::unique_ptr<ChunkingController> +CreateChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Setting) +{ + return std::make_unique<ChunkingControllerWithFixedChunking>(Setting); +} +std::unique_ptr<ChunkingController> +CreateChunkingControllerWithFixedChunking(CbObjectView Parameters) +{ + return std::make_unique<ChunkingControllerWithFixedChunking>(Parameters); +} + +std::unique_ptr<ChunkingController> +CreateChunkingController(std::string_view Name, CbObjectView Parameters) +{ + if (Name == BasicChunkingController::Name) + { + return CreateBasicChunkingController(Parameters); + } + else if (Name == ChunkingControllerWithFixedChunking::Name) + { + return CreateChunkingControllerWithFixedChunking(Parameters); + } + return {}; +} + +} // namespace zen diff --git a/src/zenremotestore/include/zenremotestore/builds/buildstorage.h b/src/zenremotestore/include/zenremotestore/builds/buildstorage.h index 46ecd0a11..ee0ddcaa4 100644 --- a/src/zenremotestore/include/zenremotestore/builds/buildstorage.h +++ b/src/zenremotestore/include/zenremotestore/builds/buildstorage.h @@ -3,7 +3,7 @@ #pragma once #include <zencore/compactbinary.h> -#include <zenutil/chunkblock.h> +#include <zenremotestore/chunking/chunkblock.h> ZEN_THIRD_PARTY_INCLUDES_START #include <tsl/robin_map.h> diff --git a/src/zenremotestore/include/zenremotestore/builds/buildstoragecache.h b/src/zenremotestore/include/zenremotestore/builds/buildstoragecache.h index e6ca2c5e4..2e8024915 100644 --- a/src/zenremotestore/include/zenremotestore/builds/buildstoragecache.h +++ b/src/zenremotestore/include/zenremotestore/builds/buildstoragecache.h @@ -6,7 +6,7 @@ #include <zencore/compactbinary.h> #include <zencore/compositebuffer.h> -#include <zenutil/chunkblock.h> +#include <zenremotestore/chunking/chunkblock.h> namespace zen { diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h b/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h new file mode 100644 index 000000000..b0d8ef24c --- /dev/null +++ b/src/zenremotestore/include/zenremotestore/chunking/chunkblock.h @@ -0,0 +1,42 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/iohash.h> + +#include <zencore/compactbinary.h> +#include <zencore/compress.h> + +#include <optional> +#include <vector> + +namespace zen { + +struct ThinChunkBlockDescription +{ + IoHash BlockHash; + std::vector<IoHash> ChunkRawHashes; +}; + +struct ChunkBlockDescription : public ThinChunkBlockDescription +{ + uint64_t HeaderSize; + std::vector<uint32_t> ChunkRawLengths; + std::vector<uint32_t> ChunkCompressedLengths; +}; + +std::vector<ChunkBlockDescription> ParseChunkBlockDescriptionList(const CbObjectView& BlocksObject); +ChunkBlockDescription ParseChunkBlockDescription(const CbObjectView& BlockObject); +CbObject BuildChunkBlockDescription(const ChunkBlockDescription& Block, CbObjectView MetaData); +ChunkBlockDescription GetChunkBlockDescription(const SharedBuffer& BlockPayload, const IoHash& RawHash); +typedef std::function<std::pair<uint64_t, CompressedBuffer>(const IoHash& RawHash)> FetchChunkFunc; + +CompressedBuffer GenerateChunkBlock(std::vector<std::pair<IoHash, FetchChunkFunc>>&& FetchChunks, ChunkBlockDescription& OutBlock); +bool IterateChunkBlock(const SharedBuffer& BlockPayload, + std::function<void(CompressedBuffer&& Chunk, const IoHash& AttachmentHash)> Visitor, + uint64_t& OutHeaderSize); +std::vector<uint32_t> ReadChunkBlockHeader(const MemoryView BlockView, uint64_t& OutHeaderSize); + +void chunkblock_forcelink(); + +} // namespace zen diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkedcontent.h b/src/zenremotestore/include/zenremotestore/chunking/chunkedcontent.h new file mode 100644 index 000000000..306a5d990 --- /dev/null +++ b/src/zenremotestore/include/zenremotestore/chunking/chunkedcontent.h @@ -0,0 +1,288 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/compactbinary.h> +#include <zencore/compactbinarybuilder.h> +#include <zencore/iohash.h> + +#include <filesystem> +#include <vector> + +ZEN_THIRD_PARTY_INCLUDES_START +#include <tsl/robin_map.h> +ZEN_THIRD_PARTY_INCLUDES_END + +namespace zen { + +class CbWriter; +class ChunkingController; +class WorkerThreadPool; + +enum class SourcePlatform +{ + Windows = 0, + Linux = 1, + MacOS = 2, + _Count +}; + +std::string_view ToString(SourcePlatform Platform); +SourcePlatform FromString(std::string_view Platform, SourcePlatform Default); +SourcePlatform GetSourceCurrentPlatform(); + +struct FolderContent +{ + SourcePlatform Platform = GetSourceCurrentPlatform(); + std::vector<std::filesystem::path> Paths; + std::vector<uint64_t> RawSizes; + std::vector<uint32_t> Attributes; + std::vector<uint64_t> ModificationTicks; + + bool operator==(const FolderContent& Rhs) const; + + bool AreKnownFilesEqual(const FolderContent& Rhs) const; + void UpdateState(const FolderContent& Rhs, std::vector<uint32_t>& PathIndexesOufOfDate); + static bool AreFileAttributesEqual(const uint32_t Lhs, const uint32_t Rhs); +}; + +FolderContent GetUpdatedContent(const FolderContent& Old, + const FolderContent& New, + std::vector<std::filesystem::path>& OutDeletedPathIndexes); + +void SaveFolderContentToCompactBinary(const FolderContent& Content, CbWriter& Output); +FolderContent LoadFolderContentToCompactBinary(CbObjectView Input); + +struct GetFolderContentStatistics +{ + std::atomic<uint64_t> FoundFileCount = 0; + std::atomic<uint64_t> FoundFileByteCount = 0; + std::atomic<uint64_t> AcceptedFileCount = 0; + std::atomic<uint64_t> AcceptedFileByteCount = 0; + uint64_t ElapsedWallTimeUS = 0; +}; + +FolderContent GetFolderContent(GetFolderContentStatistics& Stats, + const std::filesystem::path& RootPath, + std::function<bool(const std::string_view& RelativePath)>&& AcceptDirectory, + std::function<bool(std::string_view RelativePath, uint64_t Size, uint32_t Attributes)>&& AcceptFile, + WorkerThreadPool& WorkerPool, + int32_t UpdateIntervalMS, + std::function<void(bool IsAborted, std::ptrdiff_t PendingWork)>&& UpdateCallback, + std::atomic<bool>& AbortFlag); + +struct ChunkedContentData +{ + // To describe one asset with a particular RawHash, find the index of the hash in SequenceRawHashes + // ChunkCounts for that index will be the number of indexes in ChunkOrders that describe + // the sequence of chunks required to reconstruct the asset. + // Offset into ChunkOrders is based on how many entries in ChunkOrders the previous [n - 1] SequenceRawHashes uses + std::vector<IoHash> SequenceRawHashes; // Raw hash for Chunk sequence + std::vector<uint32_t> ChunkCounts; // Chunk count of ChunkOrder for SequenceRawHashes[n] + std::vector<uint32_t> ChunkOrders; // Chunk sequence indexed into ChunkHashes, ChunkCounts[n] indexes per SequenceRawHashes[n] + std::vector<IoHash> ChunkHashes; // Unique chunk hashes + std::vector<uint64_t> ChunkRawSizes; // Unique chunk raw size for ChunkHash[n] +}; + +struct ChunkedFolderContent +{ + SourcePlatform Platform = GetSourceCurrentPlatform(); + std::vector<std::filesystem::path> Paths; + std::vector<uint64_t> RawSizes; + std::vector<uint32_t> Attributes; + std::vector<IoHash> RawHashes; + ChunkedContentData ChunkedContent; +}; + +struct ChunkedContentLookup +{ + struct ChunkSequenceLocation + { + uint32_t SequenceIndex = (uint32_t)-1; + uint64_t Offset = (uint64_t)-1; + }; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> RawHashToSequenceIndex; + std::vector<uint32_t> SequenceIndexChunkOrderOffset; + std::vector<ChunkSequenceLocation> ChunkSequenceLocations; + std::vector<size_t> + ChunkSequenceLocationOffset; // ChunkSequenceLocations[ChunkLocationOffset[ChunkIndex]] -> start of sources for ChunkIndex + std::vector<uint32_t> ChunkSequenceLocationCounts; // ChunkSequenceLocationCounts[ChunkIndex] count of chunk locations for ChunkIndex + std::vector<uint32_t> SequenceIndexFirstPathIndex; // SequenceIndexFirstPathIndex[SequenceIndex] -> first path index with that RawHash + std::vector<uint32_t> PathExtensionHash; +}; + +void SaveChunkedFolderContentToCompactBinary(const ChunkedFolderContent& Content, CbWriter& Output); +ChunkedFolderContent LoadChunkedFolderContentToCompactBinary(CbObjectView Input); + +ChunkedFolderContent MergeChunkedFolderContents(const ChunkedFolderContent& Base, std::span<const ChunkedFolderContent> Overlays); +ChunkedFolderContent DeletePathsFromChunkedContent(const ChunkedFolderContent& Base, + const ChunkedContentLookup& BaseContentLookup, + std::span<const std::filesystem::path> DeletedPaths); +ChunkedFolderContent DeletePathsFromChunkedContent(const ChunkedFolderContent& Base, std::span<const std::filesystem::path> DeletedPaths); + +struct ChunkingStatistics +{ + std::atomic<uint64_t> FilesProcessed = 0; + std::atomic<uint64_t> FilesChunked = 0; + std::atomic<uint64_t> BytesHashed = 0; + std::atomic<uint64_t> UniqueChunksFound = 0; + std::atomic<uint64_t> UniqueSequencesFound = 0; + std::atomic<uint64_t> UniqueBytesFound = 0; + uint64_t ElapsedWallTimeUS = 0; +}; + +ChunkedFolderContent ChunkFolderContent(ChunkingStatistics& Stats, + WorkerThreadPool& WorkerPool, + const std::filesystem::path& RootPath, + const FolderContent& Content, + const ChunkingController& InChunkingController, + int32_t UpdateIntervalMS, + std::function<void(bool IsAborted, bool IsPaused, std::ptrdiff_t PendingWork)>&& UpdateCallback, + std::atomic<bool>& AbortFlag, + std::atomic<bool>& PauseFlag); + +ChunkedContentLookup BuildChunkedContentLookup(const ChunkedFolderContent& Content); + +inline std::pair<size_t, uint32_t> +GetChunkSequenceLocationRange(const ChunkedContentLookup& Lookup, uint32_t ChunkIndex) +{ + return std::make_pair(Lookup.ChunkSequenceLocationOffset[ChunkIndex], Lookup.ChunkSequenceLocationCounts[ChunkIndex]); +} + +inline std::span<const ChunkedContentLookup::ChunkSequenceLocation> +GetChunkSequenceLocations(const ChunkedContentLookup& Lookup, uint32_t ChunkIndex) +{ + std::pair<size_t, uint32_t> Range = GetChunkSequenceLocationRange(Lookup, ChunkIndex); + return std::span<const ChunkedContentLookup::ChunkSequenceLocation>(Lookup.ChunkSequenceLocations).subspan(Range.first, Range.second); +} + +inline uint32_t +GetSequenceIndexForRawHash(const ChunkedContentLookup& Lookup, const IoHash& RawHash) +{ + return Lookup.RawHashToSequenceIndex.at(RawHash); +} + +inline uint32_t +GetChunkIndexForRawHash(const ChunkedContentLookup& Lookup, const IoHash& RawHash) +{ + return Lookup.RawHashToSequenceIndex.at(RawHash); +} + +inline uint32_t +GetFirstPathIndexForSeqeuenceIndex(const ChunkedContentLookup& Lookup, const uint32_t SequenceIndex) +{ + return Lookup.SequenceIndexFirstPathIndex[SequenceIndex]; +} + +inline uint32_t +GetFirstPathIndexForRawHash(const ChunkedContentLookup& Lookup, const IoHash& RawHash) +{ + const uint32_t SequenceIndex = GetSequenceIndexForRawHash(Lookup, RawHash); + return GetFirstPathIndexForSeqeuenceIndex(Lookup, SequenceIndex); +} + +namespace compactbinary_helpers { + template<typename Type> + void WriteArray(std::span<const Type> Values, std::string_view ArrayName, CbWriter& Output) + { + Output.BeginArray(ArrayName); + for (const Type Value : Values) + { + Output << Value; + } + Output.EndArray(); + } + + template<typename Type> + void WriteArray(const std::vector<Type>& Values, std::string_view ArrayName, CbWriter& Output) + { + WriteArray(std::span<const Type>(Values), ArrayName, Output); + } + + template<> + inline void WriteArray(std::span<const std::filesystem::path> Values, std::string_view ArrayName, CbWriter& Output) + { + Output.BeginArray(ArrayName); + for (const std::filesystem::path& Path : Values) + { + Output.AddString((const char*)Path.generic_u8string().c_str()); + } + Output.EndArray(); + } + + template<> + inline void WriteArray(const std::vector<std::filesystem::path>& Values, std::string_view ArrayName, CbWriter& Output) + { + WriteArray(std::span<const std::filesystem::path>(Values), ArrayName, Output); + } + + inline void WriteBinaryAttachmentArray(std::span<const IoHash> Values, std::string_view ArrayName, CbWriter& Output) + { + Output.BeginArray(ArrayName); + for (const IoHash& Hash : Values) + { + Output.AddBinaryAttachment(Hash); + } + Output.EndArray(); + } + + inline void WriteBinaryAttachmentArray(const std::vector<IoHash>& Values, std::string_view ArrayName, CbWriter& Output) + { + WriteArray(std::span<const IoHash>(Values), ArrayName, Output); + } + + inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<uint32_t>& Result) + { + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsUInt32()); + } + } + + inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<uint64_t>& Result) + { + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsUInt64()); + } + } + + inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<std::filesystem::path>& Result) + { + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + std::u8string_view U8Path = ItemView.AsU8String(); + Result.push_back(std::filesystem::path(U8Path)); + } + } + + inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<IoHash>& Result) + { + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsHash()); + } + } + + inline void ReadBinaryAttachmentArray(std::string_view ArrayName, CbObjectView Input, std::vector<IoHash>& Result) + { + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsBinaryAttachment()); + } + } + +} // namespace compactbinary_helpers + +} // namespace zen diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkedfile.h b/src/zenremotestore/include/zenremotestore/chunking/chunkedfile.h new file mode 100644 index 000000000..4cec80fdb --- /dev/null +++ b/src/zenremotestore/include/zenremotestore/chunking/chunkedfile.h @@ -0,0 +1,59 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/zencore.h> + +#include <functional> +#include <vector> + +namespace zen { + +class BasicFile; + +struct ChunkedInfo +{ + uint64_t RawSize = 0; + IoHash RawHash; + std::vector<uint32_t> ChunkSequence; + std::vector<IoHash> ChunkHashes; +}; + +struct ChunkSource +{ + uint64_t Offset; // 8 + uint32_t Size; // 4 +}; + +struct ChunkedInfoWithSource +{ + ChunkedInfo Info; + std::vector<ChunkSource> ChunkSources; +}; + +struct ChunkedParams +{ + bool UseThreshold = true; + size_t MinSize = (2u * 1024u) - 128u; + size_t MaxSize = (16u * 1024u); + size_t AvgSize = (3u * 1024u); +}; + +static const ChunkedParams UShaderByteCodeParams = {.UseThreshold = true, .MinSize = 17280, .MaxSize = 139264, .AvgSize = 36340}; + +ChunkedInfoWithSource ChunkData(BasicFile& RawData, + uint64_t Offset, + uint64_t Size, + ChunkedParams Params = {}, + std::atomic<uint64_t>* BytesProcessed = nullptr, + std::atomic<bool>* AbortFlag = nullptr); +void Reconstruct(const ChunkedInfo& Info, + const std::filesystem::path& TargetPath, + std::function<IoBuffer(const IoHash& ChunkHash)> GetChunk); +IoBuffer SerializeChunkedInfo(const ChunkedInfo& Info); +ChunkedInfo DeserializeChunkedInfo(IoBuffer& Buffer); + +void chunkedfile_forcelink(); +} // namespace zen diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkingcontroller.h b/src/zenremotestore/include/zenremotestore/chunking/chunkingcontroller.h new file mode 100644 index 000000000..2d1ba36aa --- /dev/null +++ b/src/zenremotestore/include/zenremotestore/chunking/chunkingcontroller.h @@ -0,0 +1,75 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/compactbinary.h> + +#include <zenremotestore/chunking/chunkedfile.h> + +#include <atomic> +#include <filesystem> + +namespace zen { + +const std::vector<std::string> DefaultChunkingExcludeExtensions = + {".exe", ".dll", ".pdb", ".self", ".mp4", ".zip", ".7z", ".bzip", ".rar", ".gzip"}; +const std::vector<std::string> DefaultFixedChunkingExtensions = {".apk", ".nsp", ".xvc", ".pkg", ".dmg", ".ipa"}; +const bool DefaultChunkingExcludeElfFiles = true; +const bool DefaultChunkingExcludeMachOFiles = true; + +const ChunkedParams DefaultChunkedParams = {.MinSize = ((8u * 1u) * 1024u) - 128u, + .MaxSize = 128u * 1024u, + .AvgSize = ((8u * 4u) * 1024u) + 128u}; + +const size_t DefaultChunkingFileSizeLimit = DefaultChunkedParams.MaxSize; + +const uint64_t DefaultFixedChunkingChunkSize = 32u * 1024u * 1024u; +const uint64_t DefaultMinSizeForFixedChunking = DefaultFixedChunkingChunkSize * 8u; + +struct ChunkedInfoWithSource; + +class ChunkingController +{ +public: + virtual ~ChunkingController() {} + + // Return true if the input file was processed. If true is returned OutChunked will contain the chunked info + virtual bool ProcessFile(const std::filesystem::path& InputPath, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + std::atomic<bool>& AbortFlag) const = 0; + virtual std::string_view GetName() const = 0; + virtual CbObject GetParameters() const = 0; +}; + +struct BasicChunkingControllerSettings +{ + std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; + bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; + bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; + uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; + ChunkedParams ChunkingParams = DefaultChunkedParams; +}; + +std::unique_ptr<ChunkingController> CreateBasicChunkingController(const BasicChunkingControllerSettings& Settings); +std::unique_ptr<ChunkingController> CreateBasicChunkingController(CbObjectView Parameters); + +struct ChunkingControllerWithFixedChunkingSettings +{ + std::vector<std::string> FixedChunkingExtensions = DefaultFixedChunkingExtensions; + std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; + bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; + bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; + uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; + ChunkedParams ChunkingParams = DefaultChunkedParams; + uint64_t FixedChunkingChunkSize = DefaultFixedChunkingChunkSize; + uint64_t MinSizeForFixedChunking = DefaultMinSizeForFixedChunking; +}; + +std::unique_ptr<ChunkingController> CreateChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Setting); +std::unique_ptr<ChunkingController> CreateChunkingControllerWithFixedChunking(CbObjectView Parameters); + +std::unique_ptr<ChunkingController> CreateChunkingController(std::string_view Name, CbObjectView Parameters); + +} // namespace zen diff --git a/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h b/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h index 11cc58e4d..7e5af5e6b 100644 --- a/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h +++ b/src/zenremotestore/include/zenremotestore/projectstore/remoteprojectstore.h @@ -5,7 +5,7 @@ #include <zencore/jobqueue.h> #include <zenstore/projectstore.h> -#include <zenutil/chunkblock.h> +#include <zenremotestore/chunking/chunkblock.h> #include <unordered_set> diff --git a/src/zenremotestore/projectstore/remoteprojectstore.cpp b/src/zenremotestore/projectstore/remoteprojectstore.cpp index 822f0b29e..c2e270909 100644 --- a/src/zenremotestore/projectstore/remoteprojectstore.cpp +++ b/src/zenremotestore/projectstore/remoteprojectstore.cpp @@ -13,8 +13,8 @@ #include <zencore/timer.h> #include <zencore/workthreadpool.h> #include <zenhttp/httpcommon.h> +#include <zenremotestore/chunking/chunkedfile.h> #include <zenstore/cidstore.h> -#include <zenutil/chunkedfile.h> #include <zenutil/workerpools.h> #include <unordered_map> diff --git a/src/zenremotestore/zenremotestore.cpp b/src/zenremotestore/zenremotestore.cpp index 2fa3ac6a4..c019bc71d 100644 --- a/src/zenremotestore/zenremotestore.cpp +++ b/src/zenremotestore/zenremotestore.cpp @@ -2,6 +2,9 @@ #include <zenremotestore/zenremotestore.h> +#include <zenremotestore/chunking/chunkedfile.h> +#include <zenremotestore/projectstore/remoteprojectstore.h> + #if ZEN_WITH_TESTS namespace zen { @@ -9,6 +12,9 @@ namespace zen { void zenremotestore_forcelinktests() { + chunkblock_forcelink(); + chunkedfile_forcelink(); + remoteprojectstore_forcelink(); } } // namespace zen |