aboutsummaryrefslogtreecommitdiff
path: root/src/zenutil
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2025-02-12 09:02:35 +0100
committerGitHub Enterprise <[email protected]>2025-02-12 09:02:35 +0100
commitda9179d330a37132488f6deb8d8068783b087256 (patch)
tree3309dfe685495bab7f18068f7c0d1dbd76a4b536 /src/zenutil
parentimproved builds api interface in jupiter (#281) (diff)
downloadzen-da9179d330a37132488f6deb8d8068783b087256.tar.xz
zen-da9179d330a37132488f6deb8d8068783b087256.zip
moving and small refactor of chunk blocks to prepare for builds api (#282)
Diffstat (limited to 'src/zenutil')
-rw-r--r--src/zenutil/chunkblock.cpp166
-rw-r--r--src/zenutil/chunkedfile.cpp510
-rw-r--r--src/zenutil/chunking.cpp382
-rw-r--r--src/zenutil/chunking.h56
-rw-r--r--src/zenutil/include/zenutil/chunkblock.h32
-rw-r--r--src/zenutil/include/zenutil/chunkedfile.h58
-rw-r--r--src/zenutil/zenutil.cpp2
7 files changed, 1206 insertions, 0 deletions
diff --git a/src/zenutil/chunkblock.cpp b/src/zenutil/chunkblock.cpp
new file mode 100644
index 000000000..6dae5af11
--- /dev/null
+++ b/src/zenutil/chunkblock.cpp
@@ -0,0 +1,166 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#include <zenutil/chunkblock.h>
+
+#include <zencore/compactbinarybuilder.h>
+#include <zencore/logging.h>
+
+#include <vector>
+
+namespace zen {
+
+using namespace std::literals;
+
+// Parses a single chunk block description from its compact binary representation.
+//
+// The "rawHash" field is read as the block hash. Only when that hash is not IoHash::Zero are the
+// "rawHashes" and "chunkRawLengths" arrays read into ChunkHashes and ChunkRawLengths respectively;
+// otherwise those members are left as default-constructed.
+//
+// BlockObject: compact binary object describing one block.
+// Returns the parsed description. The two arrays are read independently; this function does not
+// check that they hold the same number of entries.
+ChunkBlockDescription
+ParseChunkBlockDescription(const CbObjectView& BlockObject)
+{
+	ChunkBlockDescription Result;
+	Result.BlockHash = BlockObject["rawHash"sv].AsHash();
+	if (Result.BlockHash != IoHash::Zero)
+	{
+		CbArrayView ChunksArray = BlockObject["rawHashes"sv].AsArrayView();
+		Result.ChunkHashes.reserve(ChunksArray.Num());
+		for (CbFieldView ChunkView : ChunksArray)
+		{
+			Result.ChunkHashes.push_back(ChunkView.AsHash());
+		}
+
+		CbArrayView ChunkRawLengthsArray = BlockObject["chunkRawLengths"sv].AsArrayView();
+		Result.ChunkRawLengths.reserve(ChunkRawLengthsArray.Num());
+		for (CbFieldView ChunkView : ChunkRawLengthsArray)
+		{
+			Result.ChunkRawLengths.push_back(ChunkView.AsUInt32());
+		}
+	}
+	return Result;
+}
+
+// Parses every entry of the "blocks" array found in BlocksObject.
+//
+// Returns one ChunkBlockDescription per array entry, in array order. An object that evaluates to
+// false yields an empty list.
+std::vector<ChunkBlockDescription>
+ParseChunkBlockDescriptionList(const CbObjectView& BlocksObject)
+{
+	std::vector<ChunkBlockDescription> Descriptions;
+	if (BlocksObject)
+	{
+		CbArrayView BlockArray = BlocksObject["blocks"].AsArrayView();
+		Descriptions.reserve(BlockArray.Num());
+		for (CbFieldView Entry : BlockArray)
+		{
+			// Each array entry is itself an object in the single-block format.
+			Descriptions.emplace_back(ParseChunkBlockDescription(Entry.AsObjectView()));
+		}
+	}
+	return Descriptions;
+}
+
+// Serializes a chunk block description into a compact binary object.
+//
+// The written fields are "rawHash" (block hash), "rawHashes" (one hash per chunk),
+// "chunkRawLengths" (one integer per chunk) and "metadata" (the MetaData object as given).
+// Asserts that Block carries exactly one raw length per chunk hash.
+CbObject
+BuildChunkBlockDescription(const ChunkBlockDescription& Block, CbObjectView MetaData)
+{
+	ZEN_ASSERT(Block.ChunkRawLengths.size() == Block.ChunkHashes.size());
+
+	CbObjectWriter Description;
+	Description.AddHash("rawHash"sv, Block.BlockHash);
+
+	Description.BeginArray("rawHashes"sv);
+	for (const IoHash& Hash : Block.ChunkHashes)
+	{
+		Description.AddHash(Hash);
+	}
+	Description.EndArray();
+
+	Description.BeginArray("chunkRawLengths");
+	for (uint32_t RawLength : Block.ChunkRawLengths)
+	{
+		Description.AddInteger(RawLength);
+	}
+	Description.EndArray();
+
+	Description.AddObject("metadata", MetaData);
+	return Description.Save();
+}
+
+// Builds a chunk block out of the given chunks and fills in OutBlock with its description.
+//
+// Block layout before compression: a header segment holding the chunk count followed by the
+// compressed size of each chunk (all written with WriteVarUInt), followed by the compressed
+// segments of every chunk in the order they were fetched.
+//
+// FetchChunks: (hash, fetch function) pairs. Each fetch function is invoked with its hash and
+//              returns a pair whose first member is recorded as the chunk's raw length (narrowed
+//              to 32 bits) and whose second member is the compressed chunk.
+// OutBlock:    receives the chunk hashes and raw lengths (appended) and the block hash.
+// Returns the block compressed with OodleCompressor::Mermaid at OodleCompressionLevel::None.
+CompressedBuffer
+GenerateChunkBlock(std::vector<std::pair<IoHash, FetchChunkFunc>>&& FetchChunks, ChunkBlockDescription& OutBlock)
+{
+	const size_t ChunkCount = FetchChunks.size();
+
+	// The header budget is 9 bytes per encoded value. It holds the chunk count *and* one size per
+	// chunk, i.e. ChunkCount + 1 values; sizing it for ChunkCount values only leaves no room for the
+	// count itself (with zero chunks the count would be written into a zero-sized buffer).
+	constexpr size_t MaxVarUIntSize = 9;
+
+	std::vector<SharedBuffer> ChunkSegments;
+	ChunkSegments.resize(1);  // slot 0 is reserved for the header, filled in once all sizes are known
+	ChunkSegments.reserve(1 + ChunkCount);
+	OutBlock.ChunkHashes.reserve(ChunkCount);
+	OutBlock.ChunkRawLengths.reserve(ChunkCount);
+	{
+		IoBuffer TempBuffer((ChunkCount + 1) * MaxVarUIntSize);
+		MutableMemoryView View = TempBuffer.GetMutableView();
+		uint8_t* BufferStartPtr = reinterpret_cast<uint8_t*>(View.GetData());
+		uint8_t* BufferEndPtr = BufferStartPtr;
+		BufferEndPtr += WriteVarUInt(gsl::narrow<uint64_t>(ChunkCount), BufferEndPtr);
+		for (const auto& It : FetchChunks)
+		{
+			std::pair<uint64_t, CompressedBuffer> Chunk = It.second(It.first);
+			uint64_t ChunkSize = 0;
+			std::span<const SharedBuffer> Segments = Chunk.second.GetCompressed().GetSegments();
+			for (const SharedBuffer& Segment : Segments)
+			{
+				ChunkSize += Segment.GetSize();
+				ChunkSegments.push_back(Segment);
+			}
+			BufferEndPtr += WriteVarUInt(ChunkSize, BufferEndPtr);
+			OutBlock.ChunkHashes.push_back(It.first);
+			OutBlock.ChunkRawLengths.push_back(gsl::narrow<uint32_t>(Chunk.first));
+		}
+		ZEN_ASSERT(BufferEndPtr <= View.GetDataEnd());
+		// Only the bytes actually written become the header segment.
+		ptrdiff_t TempBufferLength = std::distance(BufferStartPtr, BufferEndPtr);
+		ChunkSegments[0] = SharedBuffer(IoBuffer(TempBuffer, 0, gsl::narrow<size_t>(TempBufferLength)));
+	}
+	CompressedBuffer CompressedBlock =
+		CompressedBuffer::Compress(CompositeBuffer(std::move(ChunkSegments)), OodleCompressor::Mermaid, OodleCompressionLevel::None);
+	OutBlock.BlockHash = CompressedBlock.DecodeRawHash();
+	return CompressedBlock;
+}
+
+// Walks the chunks stored in a chunk block payload and hands each one to Visitor.
+//
+// The payload is expected to start with a var-uint chunk count followed by one var-uint size per
+// chunk, followed by the chunk payloads back to back.
+// NOTE(review): presumably this is the decompressed form of what GenerateChunkBlock produces - confirm.
+//
+// BlockPayload: the block bytes; must be a valid (non-null) buffer.
+// Visitor:      called once per chunk with the decoded compressed buffer and its raw hash.
+// Returns false if the payload is empty, if the header or a chunk extends past the end of the
+// payload, or if a chunk is not a valid compressed buffer. Visitor may already have been called
+// for earlier chunks when false is returned.
+bool
+IterateChunkBlock(const SharedBuffer& BlockPayload, std::function<void(CompressedBuffer&& Chunk, const IoHash& AttachmentHash)> Visitor)
+{
+	ZEN_ASSERT(BlockPayload);
+	if (BlockPayload.GetSize() < 1)
+	{
+		return false;
+	}
+
+	MemoryView BlockView = BlockPayload.GetView();
+	const uint8_t* ReadPtr = reinterpret_cast<const uint8_t*>(BlockView.GetData());
+	const uint8_t* const EndPtr = reinterpret_cast<const uint8_t*>(BlockView.GetDataEnd());
+
+	// Reads one var-uint at ReadPtr, but only after verifying that the whole encoded value lies
+	// inside the payload, so a truncated or corrupt header can never cause an out-of-bounds read.
+	auto ReadBoundedVarUInt = [&ReadPtr, EndPtr](uint64_t& OutValue) -> bool {
+		if (ReadPtr >= EndPtr)
+		{
+			return false;
+		}
+		const uint64_t Remaining = static_cast<uint64_t>(EndPtr - ReadPtr);
+		if (MeasureVarUInt(ReadPtr) > Remaining)
+		{
+			return false;
+		}
+		uint32_t NumberSize = 0;
+		OutValue = ReadVarUInt(ReadPtr, NumberSize);
+		ReadPtr += NumberSize;
+		return true;
+	};
+
+	// Every chunk needs at least one header byte for its size, so a count larger than the remaining
+	// payload is necessarily corrupt. Checking this also bounds the reserve() below.
+	uint64_t ChunkCount = 0;
+	if (!ReadBoundedVarUInt(ChunkCount) || ChunkCount > static_cast<uint64_t>(EndPtr - ReadPtr))
+	{
+		ZEN_ERROR("Invalid chunk count in block");
+		return false;
+	}
+
+	std::vector<uint64_t> ChunkSizes;
+	ChunkSizes.reserve(static_cast<size_t>(ChunkCount));
+	for (uint64_t ChunkIndex = 0; ChunkIndex < ChunkCount; ++ChunkIndex)
+	{
+		uint64_t ChunkSize = 0;
+		if (!ReadBoundedVarUInt(ChunkSize))
+		{
+			ZEN_ERROR("Invalid chunk size in block");
+			return false;
+		}
+		ChunkSizes.push_back(ChunkSize);
+	}
+
+	for (uint64_t ChunkSize : ChunkSizes)
+	{
+		// Validate the range before wrapping it; the chunk must lie entirely inside the payload.
+		if (ChunkSize > static_cast<uint64_t>(EndPtr - ReadPtr))
+		{
+			ZEN_ERROR("Invalid chunk in block");
+			return false;
+		}
+
+		IoBuffer Chunk(IoBuffer::Wrap, ReadPtr, ChunkSize);
+		IoHash AttachmentRawHash;
+		uint64_t AttachmentRawSize;
+		CompressedBuffer CompressedChunk = CompressedBuffer::FromCompressed(SharedBuffer(Chunk), AttachmentRawHash, AttachmentRawSize);
+
+		if (!CompressedChunk)
+		{
+			ZEN_ERROR("Invalid chunk in block");
+			return false;
+		}
+		Visitor(std::move(CompressedChunk), AttachmentRawHash);
+		ReadPtr += ChunkSize;
+	}
+	return true;
+}
+
+} // namespace zen
diff --git a/src/zenutil/chunkedfile.cpp b/src/zenutil/chunkedfile.cpp
new file mode 100644
index 000000000..c08492eb0
--- /dev/null
+++ b/src/zenutil/chunkedfile.cpp
@@ -0,0 +1,510 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#include <zenutil/chunkedfile.h>
+
+#include <zencore/basicfile.h>
+
+#include "chunking.h"
+
+ZEN_THIRD_PARTY_INCLUDES_START
+#include <tsl/robin_map.h>
+#include <gsl/gsl-lite.hpp>
+ZEN_THIRD_PARTY_INCLUDES_END
+
+namespace zen {
+
+namespace {
+	// Fixed-size header placed at the start of a serialized ChunkedInfo. It is copied to and from
+	// the buffer as raw bytes, so the order and types of the members define the format and must
+	// not be changed.
+	struct ChunkedHeader
+	{
+		static constexpr uint32_t ExpectedMagic = 0x646b6863;  // chkd
+		static constexpr uint32_t CurrentVersion = 1;
+
+		uint32_t Magic = ExpectedMagic;
+		uint32_t Version = CurrentVersion;
+		// Number of uint32_t entries in the chunk sequence section.
+		uint32_t ChunkSequenceLength = 0;
+		// Number of IoHash entries in the chunk hashes section.
+		uint32_t ChunkHashCount = 0;
+		// Byte offsets of the two sections, measured from the start of the buffer.
+		uint64_t ChunkSequenceOffset = 0;
+		uint64_t ChunkHashesOffset = 0;
+		uint64_t RawSize = 0;
+		IoHash RawHash;
+	};
+}  // namespace
+
+// Serializes Info into a single buffer: a ChunkedHeader, then the chunk sequence (uint32_t
+// entries), then the chunk hashes (IoHash entries). Each of the three sections is sized to a
+// multiple of 16 bytes, and the section offsets are recorded in the header.
+IoBuffer
+SerializeChunkedInfo(const ChunkedInfo& Info)
+{
+	const size_t SequenceBytes = sizeof(uint32_t) * Info.ChunkSequence.size();
+	const size_t HashBytes = sizeof(IoHash) * Info.ChunkHashes.size();
+
+	const size_t TotalSize = RoundUp(sizeof(ChunkedHeader), 16) + RoundUp(SequenceBytes, 16) + RoundUp(HashBytes, 16);
+	IoBuffer Serialized(TotalSize);
+
+	ChunkedHeader Header;
+	Header.ChunkSequenceLength = gsl::narrow<uint32_t>(Info.ChunkSequence.size());
+	Header.ChunkHashCount = gsl::narrow<uint32_t>(Info.ChunkHashes.size());
+	Header.ChunkSequenceOffset = RoundUp(sizeof(ChunkedHeader), 16);
+	Header.ChunkHashesOffset = RoundUp(Header.ChunkSequenceOffset + SequenceBytes, 16);
+	Header.RawSize = Info.RawSize;
+	Header.RawHash = Info.RawHash;
+
+	// Copy each section to the offset recorded for it in the header.
+	MutableMemoryView Target = Serialized.GetMutableView();
+	Target.Left(sizeof(Header)).CopyFrom(MemoryView(&Header, sizeof(Header)));
+	Target.Mid(Header.ChunkSequenceOffset, SequenceBytes).CopyFrom(MemoryView(Info.ChunkSequence.data(), SequenceBytes));
+	Target.Mid(Header.ChunkHashesOffset, HashBytes).CopyFrom(MemoryView(Info.ChunkHashes.data(), HashBytes));
+
+	return Serialized;
+}
+
+// Reconstructs a ChunkedInfo from a buffer produced by SerializeChunkedInfo.
+//
+// Returns a default-constructed ChunkedInfo when the buffer is too small to hold a header, when
+// the magic or version does not match, or when the chunk sequence / chunk hash sections described
+// by the header do not lie entirely inside the buffer.
+ChunkedInfo
+DeserializeChunkedInfo(IoBuffer& Buffer)
+{
+	MemoryView View = Buffer.GetView();
+
+	// ChunkedHeader default-initializes Magic and Version to the expected values, so a buffer too
+	// short to overwrite them would otherwise pass the checks below. Reject it up front.
+	if (View.GetSize() < sizeof(ChunkedHeader))
+	{
+		return {};
+	}
+
+	ChunkedHeader Header;
+	{
+		MutableMemoryView HeaderWriteView(&Header, sizeof(Header));
+		HeaderWriteView.CopyFrom(View.Left(sizeof(Header)));
+	}
+	if (Header.Magic != ChunkedHeader::ExpectedMagic)
+	{
+		return {};
+	}
+	if (Header.Version != ChunkedHeader::CurrentVersion)
+	{
+		return {};
+	}
+
+	// The counts and offsets come from the buffer itself; verify both sections fit before sizing
+	// any container from them. Written so that neither comparison can overflow.
+	const uint64_t ViewSize = View.GetSize();
+	const uint64_t SequenceBytes = uint64_t(sizeof(uint32_t)) * Header.ChunkSequenceLength;
+	const uint64_t HashBytes = uint64_t(sizeof(IoHash)) * Header.ChunkHashCount;
+	const auto FitsInView = [ViewSize](uint64_t SectionOffset, uint64_t SectionSize) {
+		return SectionOffset <= ViewSize && SectionSize <= ViewSize - SectionOffset;
+	};
+	if (!FitsInView(Header.ChunkSequenceOffset, SequenceBytes) || !FitsInView(Header.ChunkHashesOffset, HashBytes))
+	{
+		return {};
+	}
+
+	ChunkedInfo Info;
+	Info.RawSize = Header.RawSize;
+	Info.RawHash = Header.RawHash;
+	Info.ChunkSequence.resize(Header.ChunkSequenceLength);
+	Info.ChunkHashes.resize(Header.ChunkHashCount);
+	{
+		MutableMemoryView ChunkSequenceWriteView(Info.ChunkSequence.data(), sizeof(uint32_t) * Header.ChunkSequenceLength);
+		ChunkSequenceWriteView.CopyFrom(View.Mid(Header.ChunkSequenceOffset, ChunkSequenceWriteView.GetSize()));
+	}
+	{
+		MutableMemoryView ChunksWriteView(Info.ChunkHashes.data(), sizeof(IoHash) * Header.ChunkHashCount);
+		ChunksWriteView.CopyFrom(View.Mid(Header.ChunkHashesOffset, ChunksWriteView.GetSize()));
+	}
+
+	return Info;
+}
+
+// Rebuilds a file at TargetPath from its chunked description.
+//
+// The target is opened in truncate mode. For every entry of Info.ChunkSequence the referenced
+// chunk hash is passed to GetChunk, and the returned data is written at the running offset, so
+// chunks end up back to back in sequence order.
+void
+Reconstruct(const ChunkedInfo& Info, const std::filesystem::path& TargetPath, std::function<IoBuffer(const IoHash& ChunkHash)> GetChunk)
+{
+	BasicFile TargetFile;
+	TargetFile.Open(TargetPath, BasicFile::Mode::kTruncate);
+	BasicFileWriter Writer(TargetFile, 64 * 1024);
+
+	uint64_t WriteOffset = 0;
+	for (size_t Position = 0; Position < Info.ChunkSequence.size(); ++Position)
+	{
+		// The sequence stores indices into ChunkHashes rather than the hashes themselves.
+		const IoHash& Hash = Info.ChunkHashes[Info.ChunkSequence[Position]];
+		IoBuffer Data = GetChunk(Hash);
+		const auto DataSize = Data.GetSize();
+		Writer.Write(Data.GetData(), DataSize, WriteOffset);
+		WriteOffset += DataSize;
+	}
+}
+
+// Splits the byte range [Offset, Offset + Size) of RawData into chunks using ZenChunkHelper.
+//
+// Each distinct chunk (by IoHash of its bytes) is recorded once in Info.ChunkHashes together with
+// its source location in ChunkSources; Info.ChunkSequence lists, in file order, the index of the
+// chunk found at each position. Info.RawHash is the hash of all chunk bytes streamed in order and
+// Info.RawSize is Size.
+//
+// Params:         threshold flag and min/max/avg chunk sizes forwarded to the chunker.
+// BytesProcessed: optional counter, incremented by the length of every chunk as it is emitted.
+//
+// Asserts that the first view read from the file is not empty.
+// NOTE(review): when no boundary is found and the window does not reach the end of the range, the
+// window is re-read from the current offset and scanning restarts; this looks like it relies on a
+// boundary always being found within the 1 MiB window - confirm against ZenChunkHelper.
+ChunkedInfoWithSource
+ChunkData(BasicFile& RawData, uint64_t Offset, uint64_t Size, ChunkedParams Params, std::atomic<uint64_t>* BytesProcessed)
+{
+	ChunkedInfoWithSource Chunked;
+	tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> KnownChunks;
+
+	ZenChunkHelper Scanner;
+	Scanner.SetUseThreshold(Params.UseThreshold);
+	Scanner.SetChunkSize(Params.MinSize, Params.MaxSize, Params.AvgSize);
+
+	size_t EndOffset = Offset + Size;
+	const size_t WindowSize = 1u * 1024 * 1024;
+	BasicFileBuffer Reader(RawData, WindowSize);
+	MemoryView Window = Reader.MakeView(Min(EndOffset - Offset, WindowSize), Offset);
+	ZEN_ASSERT(!Window.IsEmpty());
+
+	IoHashStream WholeHash;
+	while (Offset < EndOffset)
+	{
+		const size_t WindowBytes = Window.GetSize();
+		size_t BoundaryLength = Scanner.ScanChunk(Window.GetData(), WindowBytes);
+		if (BoundaryLength == ZenChunkHelper::kNoBoundaryFound)
+		{
+			if (Offset + WindowBytes != EndOffset)
+			{
+				// More data follows: refill the window from the current offset and rescan.
+				Window = Reader.MakeView(Min(EndOffset - Offset, WindowSize), Offset);
+				Scanner.Reset();
+				continue;
+			}
+			// The window ends exactly at the end of the range: the remainder is the last chunk.
+			BoundaryLength = WindowBytes;
+		}
+
+		uint32_t ChunkLength = gsl::narrow<uint32_t>(BoundaryLength);
+		MemoryView ChunkBytes = Window.Left(BoundaryLength);
+		WholeHash.Append(ChunkBytes);
+		IoHash ChunkHash = IoHash::HashBuffer(ChunkBytes);
+		Window.RightChopInline(BoundaryLength);
+
+		auto Known = KnownChunks.find(ChunkHash);
+		if (Known == KnownChunks.end())
+		{
+			// First occurrence: register the chunk and remember where its bytes live in the source.
+			uint32_t NewIndex = gsl::narrow<uint32_t>(Chunked.Info.ChunkHashes.size());
+			KnownChunks.insert_or_assign(ChunkHash, NewIndex);
+			Chunked.Info.ChunkHashes.push_back(ChunkHash);
+			Chunked.ChunkSources.push_back(ChunkSource{.Offset = Offset, .Size = ChunkLength});
+			Chunked.Info.ChunkSequence.push_back(NewIndex);
+		}
+		else
+		{
+			Chunked.Info.ChunkSequence.push_back(Known->second);
+		}
+
+		Offset += ChunkLength;
+		if (BytesProcessed != nullptr)
+		{
+			BytesProcessed->fetch_add(ChunkLength);
+		}
+	}
+
+	Chunked.Info.RawSize = Size;
+	Chunked.Info.RawHash = WholeHash.GetHash();
+	return Chunked;
+}
+
+} // namespace zen
+
+#if ZEN_WITH_TESTS
+# include <zencore/filesystem.h>
+# include <zencore/fmtutils.h>
+# include <zencore/iohash.h>
+# include <zencore/logging.h>
+# include <zencore/scopeguard.h>
+# include <zencore/timer.h>
+# include <zencore/testing.h>
+# include <zencore/testutils.h>
+# include <zencore/workthreadpool.h>
+
+# include "chunking.h"
+
+ZEN_THIRD_PARTY_INCLUDES_START
+# include <tsl/robin_map.h>
+# include <tsl/robin_set.h>
+ZEN_THIRD_PARTY_INCLUDES_END
+
+namespace zen {
+# if 0
+TEST_CASE("chunkedfile.findparams")
+{
+# if 1
+ DirectoryContent SourceContent1;
+ GetDirectoryContent("E:\\Temp\\ChunkingTestData\\31379208", DirectoryContentFlags::IncludeFiles, SourceContent1);
+ const std::vector<std::filesystem::path>& SourceFiles1 = SourceContent1.Files;
+ DirectoryContent SourceContent2;
+ GetDirectoryContent("E:\\Temp\\ChunkingTestData\\31379208_2", DirectoryContentFlags::IncludeFiles, SourceContent2);
+ const std::vector<std::filesystem::path>& SourceFiles2 = SourceContent2.Files;
+# else
+ std::filesystem::path SourcePath1 =
+ "E:\\Temp\\ChunkingTestData\\31375996\\ShaderArchive-FortniteGame_Chunk10-PCD3D_SM6-PCD3D_SM6.ushaderbytecode";
+ std::filesystem::path SourcePath2 =
+ "E:\\Temp\\ChunkingTestData\\31379208\\ShaderArchive-FortniteGame_Chunk10-PCD3D_SM6-PCD3D_SM6.ushaderbytecode";
+ const std::vector<std::filesystem::path>& SourceFiles1 = {SourcePath1};
+ const std::vector<std::filesystem::path>& SourceFiles2 = {SourcePath2};
+# endif
+ ChunkedParams Params[] = {ChunkedParams{.UseThreshold = false, .MinSize = 17280, .MaxSize = 139264, .AvgSize = 36340},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15456, .MaxSize = 122880, .AvgSize = 35598},
+ ChunkedParams{.UseThreshold = false, .MinSize = 16848, .MaxSize = 135168, .AvgSize = 39030},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14256, .MaxSize = 114688, .AvgSize = 36222},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15744, .MaxSize = 126976, .AvgSize = 36600},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15264, .MaxSize = 122880, .AvgSize = 35442},
+ ChunkedParams{.UseThreshold = false, .MinSize = 16464, .MaxSize = 131072, .AvgSize = 37950},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15408, .MaxSize = 122880, .AvgSize = 38914},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15408, .MaxSize = 122880, .AvgSize = 35556},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15360, .MaxSize = 122880, .AvgSize = 35520},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15312, .MaxSize = 122880, .AvgSize = 35478},
+ ChunkedParams{.UseThreshold = false, .MinSize = 16896, .MaxSize = 135168, .AvgSize = 39072},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15360, .MaxSize = 122880, .AvgSize = 38880},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15840, .MaxSize = 126976, .AvgSize = 36678},
+ ChunkedParams{.UseThreshold = false, .MinSize = 16800, .MaxSize = 135168, .AvgSize = 38994},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15888, .MaxSize = 126976, .AvgSize = 36714},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15792, .MaxSize = 126976, .AvgSize = 36636},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14880, .MaxSize = 118784, .AvgSize = 37609},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15936, .MaxSize = 126976, .AvgSize = 36756},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15456, .MaxSize = 122880, .AvgSize = 38955},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15984, .MaxSize = 126976, .AvgSize = 36792},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14400, .MaxSize = 114688, .AvgSize = 36338},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14832, .MaxSize = 118784, .AvgSize = 37568},
+ ChunkedParams{.UseThreshold = false, .MinSize = 16944, .MaxSize = 135168, .AvgSize = 39108},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14352, .MaxSize = 114688, .AvgSize = 36297},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14208, .MaxSize = 114688, .AvgSize = 36188},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14448, .MaxSize = 114688, .AvgSize = 36372},
+ ChunkedParams{.UseThreshold = false, .MinSize = 13296, .MaxSize = 106496, .AvgSize = 36592},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15264, .MaxSize = 122880, .AvgSize = 38805},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14304, .MaxSize = 114688, .AvgSize = 36263},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14784, .MaxSize = 118784, .AvgSize = 37534},
+ ChunkedParams{.UseThreshold = false, .MinSize = 15312, .MaxSize = 122880, .AvgSize = 38839},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14256, .MaxSize = 114688, .AvgSize = 39360},
+ ChunkedParams{.UseThreshold = false, .MinSize = 13776, .MaxSize = 110592, .AvgSize = 37976},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14736, .MaxSize = 118784, .AvgSize = 37493},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14928, .MaxSize = 118784, .AvgSize = 37643},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14448, .MaxSize = 114688, .AvgSize = 39504},
+ ChunkedParams{.UseThreshold = false, .MinSize = 13392, .MaxSize = 106496, .AvgSize = 36664},
+ ChunkedParams{.UseThreshold = false, .MinSize = 13872, .MaxSize = 110592, .AvgSize = 38048},
+ ChunkedParams{.UseThreshold = false, .MinSize = 14352, .MaxSize = 114688, .AvgSize = 39432},
+ ChunkedParams{.UseThreshold = false, .MinSize = 13200, .MaxSize = 106496, .AvgSize = 36520},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17328, .MaxSize = 139264, .AvgSize = 36378},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17376, .MaxSize = 139264, .AvgSize = 36421},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17424, .MaxSize = 139264, .AvgSize = 36459},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17472, .MaxSize = 139264, .AvgSize = 36502},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17520, .MaxSize = 139264, .AvgSize = 36540},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17808, .MaxSize = 143360, .AvgSize = 37423},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17856, .MaxSize = 143360, .AvgSize = 37466},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 25834},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 21917},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 29751},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 33668},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17952, .MaxSize = 143360, .AvgSize = 37547},
+ ChunkedParams{.UseThreshold = false, .MinSize = 17904, .MaxSize = 143360, .AvgSize = 37504},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 22371},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18000, .MaxSize = 143360, .AvgSize = 37585},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 26406},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 26450},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 30615},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 30441},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 22417},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 22557},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 30528},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 27112},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 34644},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 34476},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 35408},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 38592},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 30483},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 26586},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 26496},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 31302},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 34516},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 22964},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 35448},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 38630},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 23010},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18816, .MaxSize = 151552, .AvgSize = 31260},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 34600},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18864, .MaxSize = 151552, .AvgSize = 27156},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 30570},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18384, .MaxSize = 147456, .AvgSize = 38549},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 22510},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18528, .MaxSize = 147456, .AvgSize = 38673},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 34560},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18432, .MaxSize = 147456, .AvgSize = 22464},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18480, .MaxSize = 147456, .AvgSize = 26540},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18336, .MaxSize = 147456, .AvgSize = 38511},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 23057},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 27202},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 31347},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18912, .MaxSize = 151552, .AvgSize = 35492},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 31389},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 27246},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 23103},
+ ChunkedParams{.UseThreshold = false, .MinSize = 18960, .MaxSize = 151552, .AvgSize = 35532},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 23150},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 27292},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 31434},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19008, .MaxSize = 151552, .AvgSize = 35576},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 27336},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 23196},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 31476},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19056, .MaxSize = 151552, .AvgSize = 35616},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 27862},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 32121},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 23603},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19344, .MaxSize = 155648, .AvgSize = 36380},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 27908},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 23650},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 32166},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19392, .MaxSize = 155648, .AvgSize = 36424},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 23696},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 32253},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 32208},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 23743},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 36548},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 28042},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 23789},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19536, .MaxSize = 155648, .AvgSize = 32295},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 36508},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 27952},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19488, .MaxSize = 155648, .AvgSize = 27998},
+ ChunkedParams{.UseThreshold = false, .MinSize = 19440, .MaxSize = 155648, .AvgSize = 36464}};
+
+ static const size_t ParamsCount = sizeof(Params) / sizeof(ChunkedParams);
+ std::vector<ChunkedInfoWithSource> Infos1(SourceFiles1.size());
+ std::vector<ChunkedInfoWithSource> Infos2(SourceFiles2.size());
+
+ WorkerThreadPool WorkerPool(32);
+
+ for (size_t I = 0; I < ParamsCount; I++)
+ {
+ for (int UseThreshold = 0; UseThreshold < 2; UseThreshold++)
+ {
+ Latch WorkLatch(1);
+ ChunkedParams Param = Params[I];
+ Param.UseThreshold = UseThreshold == 1;
+ Stopwatch Timer;
+ for (size_t F = 0; F < SourceFiles1.size(); F++)
+ {
+ WorkLatch.AddCount(1);
+ WorkerPool.ScheduleWork([&WorkLatch, F, Param, &SourceFiles1, &Infos1]() {
+ auto _ = MakeGuard([&WorkLatch]() { WorkLatch.CountDown(); });
+ BasicFile SourceData1;
+ SourceData1.Open(SourceFiles1[F], BasicFile::Mode::kRead);
+ Infos1[F] = ChunkData(SourceData1, 0, SourceData1.FileSize(), Param);
+ });
+ }
+ for (size_t F = 0; F < SourceFiles2.size(); F++)
+ {
+ WorkLatch.AddCount(1);
+ WorkerPool.ScheduleWork([&WorkLatch, F, Param, &SourceFiles2, &Infos2]() {
+ auto _ = MakeGuard([&WorkLatch]() { WorkLatch.CountDown(); });
+ BasicFile SourceData2;
+ SourceData2.Open(SourceFiles2[F], BasicFile::Mode::kRead);
+ Infos2[F] = ChunkData(SourceData2, 0, SourceData2.FileSize(), Param);
+ });
+ }
+ WorkLatch.CountDown();
+ WorkLatch.Wait();
+ uint64_t ChunkTimeMS = Timer.GetElapsedTimeMs();
+
+ uint64_t Raw1Size = 0;
+ tsl::robin_set<IoHash> Chunks1;
+ size_t ChunkedSize1 = 0;
+ for (size_t F = 0; F < SourceFiles1.size(); F++)
+ {
+ const ChunkedInfoWithSource& Info = Infos1[F];
+ Raw1Size += Info.Info.RawSize;
+ for (uint32_t Chunk1Index = 0; Chunk1Index < Info.Info.ChunkHashes.size(); ++Chunk1Index)
+ {
+ const IoHash ChunkHash = Info.Info.ChunkHashes[Chunk1Index];
+ if (Chunks1.insert(ChunkHash).second)
+ {
+ ChunkedSize1 += Info.ChunkSources[Chunk1Index].Size;
+ }
+ }
+ }
+
+ uint64_t Raw2Size = 0;
+ tsl::robin_set<IoHash> Chunks2;
+ size_t ChunkedSize2 = 0;
+ size_t DiffSize = 0;
+ for (size_t F = 0; F < SourceFiles2.size(); F++)
+ {
+ const ChunkedInfoWithSource& Info = Infos2[F];
+ Raw2Size += Info.Info.RawSize;
+ for (uint32_t Chunk2Index = 0; Chunk2Index < Info.Info.ChunkHashes.size(); ++Chunk2Index)
+ {
+ const IoHash ChunkHash = Info.Info.ChunkHashes[Chunk2Index];
+ if (Chunks2.insert(ChunkHash).second)
+ {
+ ChunkedSize2 += Info.ChunkSources[Chunk2Index].Size;
+ if (!Chunks1.contains(ChunkHash))
+ {
+ DiffSize += Info.ChunkSources[Chunk2Index].Size;
+ }
+ }
+ }
+ }
+
+ ZEN_INFO(
+ "Diff = {}, Chunks1 = {}, Chunks2 = {}, .UseThreshold = {}, .MinSize = {}, .MaxSize = {}, .AvgSize = {}, RawSize(1) = {}, "
+ "RawSize(2) = {}, "
+ "Saved(1) = {}, Saved(2) = {} in {}",
+ NiceBytes(DiffSize),
+ Chunks1.size(),
+ Chunks2.size(),
+ Param.UseThreshold,
+ Param.MinSize,
+ Param.MaxSize,
+ Param.AvgSize,
+ NiceBytes(Raw1Size),
+ NiceBytes(Raw2Size),
+ NiceBytes(Raw1Size - ChunkedSize1),
+ NiceBytes(Raw2Size - ChunkedSize2),
+ NiceTimeSpanMs(ChunkTimeMS));
+ }
+ }
+
+# if 0
+ for (int64_t MinSizeBase = (12u * 1024u); MinSizeBase <= (32u * 1024u); MinSizeBase += 512)
+ {
+ for (int64_t Wiggle = -132; Wiggle < 126; Wiggle += 2)
+ {
+ // size_t MinSize = 7 * 1024 - 61; // (size_t)(MinSizeBase + Wiggle);
+ // size_t MaxSize = 16 * (7 * 1024); // 8 * 7 * 1024;// MinSizeBase * 6;
+ // size_t AvgSize = MaxSize / 2; // 4 * 7 * 1024;// MinSizeBase * 3;
+ size_t MinSize = (size_t)(MinSizeBase + Wiggle);
+ //for (size_t MaxSize = (MinSize * 4) - 768; MaxSize < (MinSize * 5) + 768; MaxSize += 64)
+ size_t MaxSize = 8u * MinSizeBase;
+ {
+ for (size_t AvgSize = (MaxSize - MinSize) / 32 + MinSize; AvgSize < (MaxSize - MinSize) / 4 + MinSize; AvgSize += (MaxSize - MinSize) / 32)
+// size_t AvgSize = (MaxSize - MinSize) / 4 + MinSize;
+ {
+ WorkLatch.AddCount(1);
+ WorkerPool.ScheduleWork([&WorkLatch, MinSize, MaxSize, AvgSize, SourcePath1, SourcePath2]()
+ {
+ auto _ = MakeGuard([&WorkLatch]() { WorkLatch.CountDown(); });
+ ChunkedParams Params{ .UseThreshold = true, .MinSize = MinSize, .MaxSize = MaxSize, .AvgSize = AvgSize };
+ BasicFile SourceData1;
+ SourceData1.Open(SourcePath1, BasicFile::Mode::kRead);
+ BasicFile SourceData2;
+ SourceData2.Open(SourcePath2, BasicFile::Mode::kRead);
+ ChunkedInfoWithSource Info1 = ChunkData(SourceData1, Params);
+ ChunkedInfoWithSource Info2 = ChunkData(SourceData2, Params);
+
+ tsl::robin_set<IoHash> Chunks1;
+ Chunks1.reserve(Info1.Info.ChunkHashes.size());
+ Chunks1.insert(Info1.Info.ChunkHashes.begin(), Info1.Info.ChunkHashes.end());
+ size_t ChunkedSize1 = 0;
+ for (uint32_t Chunk1Index = 0; Chunk1Index < Info1.Info.ChunkHashes.size(); ++Chunk1Index)
+ {
+ ChunkedSize1 += Info1.ChunkSources[Chunk1Index].Size;
+ }
+ size_t DiffSavedSize = 0;
+ size_t ChunkedSize2 = 0;
+ for (uint32_t Chunk2Index = 0; Chunk2Index < Info2.Info.ChunkHashes.size(); ++Chunk2Index)
+ {
+ ChunkedSize2 += Info2.ChunkSources[Chunk2Index].Size;
+ if (Chunks1.find(Info2.Info.ChunkHashes[Chunk2Index]) == Chunks1.end())
+ {
+ DiffSavedSize += Info2.ChunkSources[Chunk2Index].Size;
+ }
+ }
+ ZEN_INFO("Diff {}, Chunks1: {}, Chunks2: {}, Min: {}, Max: {}, Avg: {}, Saved(1) {}, Saved(2) {}",
+ NiceBytes(DiffSavedSize),
+ Info1.Info.ChunkHashes.size(),
+ Info2.Info.ChunkHashes.size(),
+ MinSize,
+ MaxSize,
+ AvgSize,
+ NiceBytes(Info1.Info.RawSize - ChunkedSize1),
+ NiceBytes(Info2.Info.RawSize - ChunkedSize2));
+ });
+ }
+ }
+ }
+ }
+# endif // 0
+
+ // WorkLatch.CountDown();
+ // WorkLatch.Wait();
+}
+# endif // 0
+
+// Intentionally empty; only compiled when ZEN_WITH_TESTS is set.
+// NOTE(review): presumably referenced from outside this file so the linker keeps this translation
+// unit (and any tests it registers) - confirm against the caller.
+void
+chunkedfile_forcelink()
+{
+}
+
+} // namespace zen
+
+#endif
diff --git a/src/zenutil/chunking.cpp b/src/zenutil/chunking.cpp
new file mode 100644
index 000000000..30edd322a
--- /dev/null
+++ b/src/zenutil/chunking.cpp
@@ -0,0 +1,382 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#include "chunking.h"
+
+#include <gsl/gsl-lite.hpp>
+
+#include <cmath>
+
+namespace zen::detail {
+
+// Substitution table for the Buzhash rolling hash used by ZenChunkHelper.
+// It holds 256 entries and is indexed by the value of a single input byte.
+static const uint32_t BuzhashTable[] = {
+ 0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68, 0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801, 0x7ebf5191, 0x841135c7, 0x65cc53b3,
+ 0x280a597c, 0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494, 0xec85c4e6, 0xb7d33edc, 0xe549b544, 0xfdeda5aa, 0x882bf287, 0x3116737c,
+ 0x05569956, 0xe8cc1f68, 0x0806ac5e, 0x22a14443, 0x15297e10, 0x50d090e7, 0x4ba60f6f, 0xefd9f1a7, 0x5c5c885c, 0x82482f93, 0x9bfd7c64,
+ 0x0b3e7276, 0xf2688e77, 0x8fad8abc, 0xb0509568, 0xf1ada29f, 0xa53efdfe, 0xcb2b1d00, 0xf2a9e986, 0x6463432b, 0x95094051, 0x5a223ad2,
+ 0x9be8401b, 0x61e579cb, 0x1a556a14, 0x5840fdc2, 0x9261ddf6, 0xcde002bb, 0x52432bb0, 0xbf17373e, 0x7b7c222f, 0x2955ed16, 0x9f10ca59,
+ 0xe840c4c9, 0xccabd806, 0x14543f34, 0x1462417a, 0x0d4a1f9c, 0x087ed925, 0xd7f8f24c, 0x7338c425, 0xcf86c8f5, 0xb19165cd, 0x9891c393,
+ 0x325384ac, 0x0308459d, 0x86141d7e, 0xc922116a, 0xe2ffa6b6, 0x53f52aed, 0x2cd86197, 0xf5b9f498, 0xbf319c8f, 0xe0411fae, 0x977eb18c,
+ 0xd8770976, 0x9833466a, 0xc674df7f, 0x8c297d45, 0x8ca48d26, 0xc49ed8e2, 0x7344f874, 0x556f79c7, 0x6b25eaed, 0xa03e2b42, 0xf68f66a4,
+ 0x8e8b09a2, 0xf2e0e62a, 0x0d3a9806, 0x9729e493, 0x8c72b0fc, 0x160b94f6, 0x450e4d3d, 0x7a320e85, 0xbef8f0e1, 0x21d73653, 0x4e3d977a,
+ 0x1e7b3929, 0x1cc6c719, 0xbe478d53, 0x8d752809, 0xe6d8c2c6, 0x275f0892, 0xc8acc273, 0x4cc21580, 0xecc4a617, 0xf5f7be70, 0xe795248a,
+ 0x375a2fe9, 0x425570b6, 0x8898dcf8, 0xdc2d97c4, 0x0106114b, 0x364dc22f, 0x1e0cad1f, 0xbe63803c, 0x5f69fac2, 0x4d5afa6f, 0x1bc0dfb5,
+ 0xfb273589, 0x0ea47f7b, 0x3c1c2b50, 0x21b2a932, 0x6b1223fd, 0x2fe706a8, 0xf9bd6ce2, 0xa268e64e, 0xe987f486, 0x3eacf563, 0x1ca2018c,
+ 0x65e18228, 0x2207360a, 0x57cf1715, 0x34c37d2b, 0x1f8f3cde, 0x93b657cf, 0x31a019fd, 0xe69eb729, 0x8bca7b9b, 0x4c9d5bed, 0x277ebeaf,
+ 0xe0d8f8ae, 0xd150821c, 0x31381871, 0xafc3f1b0, 0x927db328, 0xe95effac, 0x305a47bd, 0x426ba35b, 0x1233af3f, 0x686a5b83, 0x50e072e5,
+ 0xd9d3bb2a, 0x8befc475, 0x487f0de6, 0xc88dff89, 0xbd664d5e, 0x971b5d18, 0x63b14847, 0xd7d3c1ce, 0x7f583cf3, 0x72cbcb09, 0xc0d0a81c,
+ 0x7fa3429b, 0xe9158a1b, 0x225ea19a, 0xd8ca9ea3, 0xc763b282, 0xbb0c6341, 0x020b8293, 0xd4cd299d, 0x58cfa7f8, 0x91b4ee53, 0x37e4d140,
+ 0x95ec764c, 0x30f76b06, 0x5ee68d24, 0x679c8661, 0xa41979c2, 0xf2b61284, 0x4fac1475, 0x0adb49f9, 0x19727a23, 0x15a7e374, 0xc43a18d5,
+ 0x3fb1aa73, 0x342fc615, 0x924c0793, 0xbee2d7f0, 0x8a279de9, 0x4aa2d70c, 0xe24dd37f, 0xbe862c0b, 0x177c22c2, 0x5388e5ee, 0xcd8a7510,
+ 0xf901b4fd, 0xdbc13dbc, 0x6c0bae5b, 0x64efe8c7, 0x48b02079, 0x80331a49, 0xca3d8ae6, 0xf3546190, 0xfed7108b, 0xc49b941b, 0x32baf4a9,
+ 0xeb833a4a, 0x88a3f1a5, 0x3a91ce0a, 0x3cc27da1, 0x7112e684, 0x4a3096b1, 0x3794574c, 0xa3c8b6f3, 0x1d213941, 0x6e0a2e00, 0x233479f1,
+ 0x0f4cd82f, 0x6093edd2, 0x5d7d209e, 0x464fe319, 0xd4dcac9e, 0x0db845cb, 0xfb5e4bc3, 0xe0256ce1, 0x09fb4ed1, 0x0914be1e, 0xa5bdb2c3,
+ 0xc6eb57bb, 0x30320350, 0x3f397e91, 0xa67791bc, 0x86bc0e2c, 0xefa0a7e2, 0xe9ff7543, 0xe733612c, 0xd185897b, 0x329e5388, 0x91dd236b,
+ 0x2ecb0d93, 0xf4d82a3d, 0x35b5c03f, 0xe4e606f0, 0x05b21843, 0x37b45964, 0x5eff22f4, 0x6027f4cc, 0x77178b3c, 0xae507131, 0x7bf7cabc,
+ 0xf9c18d66, 0x593ade65, 0xd95ddf11,
+};
+
+// Rotates Value left by RotateCount bits, taken modulo 32 (compilers turn this into a single ROL when optimizing)
+ZEN_FORCEINLINE static uint32_t
+Rotate32(uint32_t Value, size_t RotateCount)
+{
+ RotateCount &= 31;
+ // The right-shift count is masked too: for RotateCount == 0 a plain `32 - RotateCount` would shift a 32-bit value by 32, which is undefined
+ return (Value << RotateCount) | (Value >> ((32 - RotateCount) & 31));
+}
+
+} // namespace zen::detail
+
+namespace zen {
+
+// Puts the helper back into its initial scanning state: zeroes the running total reported by
+// BytesScanned() and clears the per-chunk state through InternalReset().
+void
+ZenChunkHelper::Reset()
+{
+ m_BytesScanned = 0;
+
+ InternalReset();
+}
+
+// Clears the state belonging to the chunk currently being scanned: window fill level, chunk
+// length and rolling hash. The size configuration and m_BytesScanned are not touched.
+void
+ZenChunkHelper::InternalReset()
+{
+ m_WindowSize = 0;
+ m_CurrentChunkSize = 0;
+ m_CurrentHash = 0;
+}
+
+// Configures the minimum, maximum and average chunk sizes and derives the boundary test
+// parameters (m_Discriminator and m_Threshold) from them.
+//
+// A zero argument means "not specified"; missing values are derived from the ones given:
+//   - AvgSize: geometric mean of MinSize and MaxSize when both are given, MinSize * 4 or
+//     MaxSize / 4 when only one of them is given, kDefaultAverageChunkSize when neither is.
+//   - MinSize: AvgSize / 4, but at least kChunkSizeLimitMin.
+//   - MaxSize: AvgSize * 4, but at most kChunkSizeLimitMax.
+//
+// The call is ignored while m_WindowSize is non-zero, i.e. while a chunk's window is being used.
+//
+// NOTE(review): explicitly supplied sizes are not validated yet (see the TODOs below). In
+// particular AvgSize < MinSize makes `AvgSize - MinSize` wrap around before gsl::narrow sees it,
+// and a discriminator of 1 yields a threshold of 2^32, which does not fit into uint32_t.
+void
+ZenChunkHelper::SetChunkSize(size_t MinSize, size_t MaxSize, size_t AvgSize)
+{
+ if (m_WindowSize)
+ return; // Already started
+
+ // The scanner skips (minimum chunk size - window size) bytes, so the smallest allowed minimum must exceed the window
+ static_assert(kChunkSizeLimitMin > kWindowSize);
+
+ if (AvgSize)
+ {
+ // TODO: Validate AvgSize range
+ }
+ else
+ {
+ if (MinSize && MaxSize)
+ {
+ // 2^((log2(Min) + log2(Max)) / 2) == sqrt(Min * Max), rounded to the nearest integer
+ AvgSize = std::lrint(std::pow(2, (std::log2(MinSize) + std::log2(MaxSize)) / 2));
+ }
+ else if (MinSize)
+ {
+ AvgSize = MinSize * 4;
+ }
+ else if (MaxSize)
+ {
+ AvgSize = MaxSize / 4;
+ }
+ else
+ {
+ AvgSize = kDefaultAverageChunkSize;
+ }
+ }
+
+ if (MinSize)
+ {
+ // TODO: Validate MinSize range
+ }
+ else
+ {
+ MinSize = std::max(AvgSize / 4, kChunkSizeLimitMin);
+ }
+
+ if (MaxSize)
+ {
+ // TODO: Validate MaxSize range
+ }
+ else
+ {
+ MaxSize = std::min(AvgSize * 4, kChunkSizeLimitMax);
+ }
+
+ // Start from the distance between average and minimum size, then clamp it into [MinSize, MaxSize]
+ m_Discriminator = gsl::narrow<uint32_t>(AvgSize - MinSize);
+
+ if (m_Discriminator < MinSize)
+ {
+ m_Discriminator = gsl::narrow<uint32_t>(MinSize);
+ }
+
+ if (m_Discriminator > MaxSize)
+ {
+ m_Discriminator = gsl::narrow<uint32_t>(MaxSize);
+ }
+
+ // 2^32 / discriminator: the threshold path reports a boundary when the 32-bit hash is <= this value
+ m_Threshold = gsl::narrow<uint32_t>((uint64_t(std::numeric_limits<uint32_t>::max()) + 1) / m_Discriminator);
+
+ m_ChunkSizeMin = MinSize;
+ m_ChunkSizeMax = MaxSize;
+ m_ChunkSizeAvg = AvgSize;
+}
+
+// Scans ByteCount bytes for the next chunk boundary and keeps the BytesScanned() total up to date.
+// Returns whatever InternalScanChunk() produced: kNoBoundaryFound, or the offset into this
+// buffer at which the current chunk ends.
+size_t
+ZenChunkHelper::ScanChunk(const void* DataBytesIn, size_t ByteCount)
+{
+ const size_t BoundaryOffset = InternalScanChunk(DataBytesIn, ByteCount);
+
+ // Without a boundary the whole buffer was consumed, otherwise only the bytes up to the boundary
+ m_BytesScanned += (BoundaryOffset == kNoBoundaryFound) ? ByteCount : BoundaryOffset;
+
+ return BoundaryOffset;
+}
+
+// Feeds ByteCount bytes at DataBytesIn into the rolling hash of the chunk in progress.
+//
+// Returns the offset into this buffer just past the last byte of the chunk when a boundary is
+// found (the per-chunk state is reset first), or kNoBoundaryFound when the whole buffer was
+// consumed without finding one; in that case the state is kept so that the next call continues
+// the same chunk.
+//
+// Processing happens in three phases:
+//   1. Bytes that fall before (m_ChunkSizeMin - kWindowSize) are skipped without hashing.
+//   2. The next kWindowSize bytes fill m_Window and seed the hash.
+//   3. Every further byte rolls the hash forward; once the chunk is at least m_ChunkSizeMin long
+//      a boundary is reported when the hash passes the active test, or unconditionally when the
+//      chunk reaches m_ChunkSizeMax.
+//
+// NOTE(review): phase 2 stores the window starting at index 0, while phase 3 starts replacing at
+// index (m_CurrentChunkSize % kWindowSize). These only line up when m_ChunkSizeMin is a multiple
+// of kWindowSize - confirm whether that is an intended requirement on the configured sizes.
+// NOTE(review): m_ChunkSizeMin - kWindowSize wraps around if m_ChunkSizeMin < kWindowSize.
+size_t
+ZenChunkHelper::InternalScanChunk(const void* DataBytesIn, size_t ByteCount)
+{
+ size_t CurrentOffset = 0;
+ const uint8_t* CursorPtr = reinterpret_cast<const uint8_t*>(DataBytesIn);
+
+ // There's no point in updating the hash if we know we're not
+ // going to have a cut point, so just skip the data. This logic currently
+ // provides roughly a 20% speedup on my machine
+
+ const size_t NeedHashOffset = m_ChunkSizeMin - kWindowSize;
+
+ if (m_CurrentChunkSize < NeedHashOffset)
+ {
+ const uint32_t SkipBytes = gsl::narrow<uint32_t>(std::min<uint64_t>(ByteCount, NeedHashOffset - m_CurrentChunkSize));
+
+ ByteCount -= SkipBytes;
+ m_CurrentChunkSize += SkipBytes;
+ CurrentOffset += SkipBytes;
+ CursorPtr += SkipBytes;
+
+ m_WindowSize = 0;
+
+ if (ByteCount == 0)
+ {
+ return kNoBoundaryFound;
+ }
+ }
+
+ // Fill window first
+
+ if (m_WindowSize < kWindowSize)
+ {
+ const uint32_t FillBytes = uint32_t(std::min<size_t>(ByteCount, kWindowSize - m_WindowSize));
+
+ memcpy(&m_Window[m_WindowSize], CursorPtr, FillBytes);
+
+ CursorPtr += FillBytes;
+
+ m_WindowSize += FillBytes;
+ m_CurrentChunkSize += FillBytes;
+
+ CurrentOffset += FillBytes;
+ ByteCount -= FillBytes;
+
+ if (m_WindowSize < kWindowSize)
+ {
+ // Buffer exhausted before the window was complete; the next call continues filling it
+ return kNoBoundaryFound;
+ }
+
+ // We have a full window, initialize hash
+
+ uint32_t CurrentHash = 0;
+
+ // Each table value is rotated by the distance of its byte from the end of the window;
+ // the last byte of the window is mixed in unrotated below
+ for (int i = 1; i < kWindowSize; ++i)
+ {
+ CurrentHash ^= detail::Rotate32(detail::BuzhashTable[m_Window[i - 1]], kWindowSize - i);
+ }
+
+ m_CurrentHash = CurrentHash ^ detail::BuzhashTable[m_Window[kWindowSize - 1]];
+ }
+
+ // Scan for boundaries (i.e points where the hash matches the value determined by
+ // the discriminator)
+
+ // Work on local copies inside the loops; they are written back to the members at the end
+ uint32_t CurrentHash = m_CurrentHash;
+ uint32_t CurrentChunkSize = m_CurrentChunkSize;
+
+ // Slot of m_Window that the next incoming byte replaces
+ size_t Index = CurrentChunkSize % kWindowSize;
+
+ if (m_Threshold && m_UseThreshold)
+ {
+ // This is roughly 4x faster than the general modulo approach on my
+ // TR 3990X (~940MB/sec) and doesn't require any special parameters to
+ // achieve max performance
+
+ while (ByteCount)
+ {
+ const uint8_t NewByte = *CursorPtr;
+ const uint8_t OldByte = m_Window[Index];
+
+ // Roll the hash: rotate by one, XOR out the table value of the byte being replaced, XOR in the new byte
+ CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::BuzhashTable[OldByte], m_WindowSize) ^
+ detail::BuzhashTable[NewByte];
+
+ CurrentChunkSize++;
+ CurrentOffset++;
+
+ if (CurrentChunkSize >= m_ChunkSizeMin)
+ {
+ bool FoundBoundary;
+
+ if (CurrentChunkSize >= m_ChunkSizeMax)
+ {
+ // Maximum chunk size reached, cut regardless of the hash value
+ FoundBoundary = true;
+ }
+ else
+ {
+ FoundBoundary = CurrentHash <= m_Threshold;
+ }
+
+ if (FoundBoundary)
+ {
+ // Boundary found!
+ InternalReset();
+
+ return CurrentOffset;
+ }
+ }
+
+ m_Window[Index++] = *CursorPtr;
+
+ if (Index == kWindowSize)
+ {
+ Index = 0;
+ }
+
+ ++CursorPtr;
+ --ByteCount;
+ }
+ }
+ else if ((m_Discriminator & (m_Discriminator - 1)) == 0)
+ {
+ // This is quite a bit faster than the generic modulo path, but
+ // requires a very specific average chunk size to be used. If you
+ // pass in an even power-of-two divided by 0.75 as the average
+ // chunk size you'll hit this path
+
+ // The discriminator is a power of two here, so the modulo test reduces to a bit mask
+ const uint32_t Mask = m_Discriminator - 1;
+
+ while (ByteCount)
+ {
+ const uint8_t NewByte = *CursorPtr;
+ const uint8_t OldByte = m_Window[Index];
+
+ CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::BuzhashTable[OldByte], m_WindowSize) ^
+ detail::BuzhashTable[NewByte];
+
+ CurrentChunkSize++;
+ CurrentOffset++;
+
+ if (CurrentChunkSize >= m_ChunkSizeMin)
+ {
+ bool FoundBoundary;
+
+ if (CurrentChunkSize >= m_ChunkSizeMax)
+ {
+ FoundBoundary = true;
+ }
+ else
+ {
+ FoundBoundary = (CurrentHash & Mask) == Mask;
+ }
+
+ if (FoundBoundary)
+ {
+ // Boundary found!
+ InternalReset();
+
+ return CurrentOffset;
+ }
+ }
+
+ m_Window[Index++] = *CursorPtr;
+
+ if (Index == kWindowSize)
+ {
+ Index = 0;
+ }
+
+ ++CursorPtr;
+ --ByteCount;
+ }
+ }
+ else
+ {
+ // This is the slowest path, which caps out around 250MB/sec for large sizes
+ // on my TR3900X
+
+ while (ByteCount)
+ {
+ const uint8_t NewByte = *CursorPtr;
+ const uint8_t OldByte = m_Window[Index];
+
+ CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::BuzhashTable[OldByte], m_WindowSize) ^
+ detail::BuzhashTable[NewByte];
+
+ CurrentChunkSize++;
+ CurrentOffset++;
+
+ if (CurrentChunkSize >= m_ChunkSizeMin)
+ {
+ bool FoundBoundary;
+
+ if (CurrentChunkSize >= m_ChunkSizeMax)
+ {
+ FoundBoundary = true;
+ }
+ else
+ {
+ FoundBoundary = (CurrentHash % m_Discriminator) == (m_Discriminator - 1);
+ }
+
+ if (FoundBoundary)
+ {
+ // Boundary found!
+ InternalReset();
+
+ return CurrentOffset;
+ }
+ }
+
+ m_Window[Index++] = *CursorPtr;
+
+ if (Index == kWindowSize)
+ {
+ Index = 0;
+ }
+
+ ++CursorPtr;
+ --ByteCount;
+ }
+ }
+
+ // Buffer exhausted without a boundary: persist the loop state so the next call continues this chunk
+ m_CurrentChunkSize = CurrentChunkSize;
+ m_CurrentHash = CurrentHash;
+
+ return kNoBoundaryFound;
+}
+
+} // namespace zen
diff --git a/src/zenutil/chunking.h b/src/zenutil/chunking.h
new file mode 100644
index 000000000..09c56454f
--- /dev/null
+++ b/src/zenutil/chunking.h
@@ -0,0 +1,56 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+#include <zencore/zencore.h>
+
+namespace zen {
+
+/** Content-defined chunking helper
+ *
+ * Scans a byte stream with a rolling hash over a window of kWindowSize bytes and reports the
+ * positions at which a chunk should end. Data is passed in through repeated ScanChunk() calls;
+ * state is carried between calls, so a chunk may span several buffers.
+ *
+ * NOTE(review): the size members default to 0, so SetChunkSize() presumably has to be called
+ * before the first ScanChunk() - confirm against callers.
+ */
+class ZenChunkHelper
+{
+public:
+ // Sets the chunk size limits; a zero argument is derived from the others. Ignored while a window is in use
+ void SetChunkSize(size_t MinSize, size_t MaxSize, size_t AvgSize);
+ // Returns the offset in DataBytes at which the current chunk ends, or kNoBoundaryFound
+ size_t ScanChunk(const void* DataBytes, size_t ByteCount);
+ // Clears the per-chunk state and the BytesScanned() total; the size configuration is kept
+ void Reset();
+
+ // This controls which chunking approach is used - threshold or
+ // modulo based. Threshold is faster and generates similarly sized
+ // chunks
+ void SetUseThreshold(bool NewState) { m_UseThreshold = NewState; }
+
+ inline size_t ChunkSizeMin() const { return m_ChunkSizeMin; }
+ inline size_t ChunkSizeMax() const { return m_ChunkSizeMax; }
+ inline size_t ChunkSizeAvg() const { return m_ChunkSizeAvg; }
+ // Total number of bytes consumed by ScanChunk() since construction or the last Reset()
+ inline uint64_t BytesScanned() const { return m_BytesScanned; }
+
+ // Returned by ScanChunk() when the whole buffer was consumed without reaching a chunk boundary
+ static constexpr size_t kNoBoundaryFound = size_t(~0ull);
+
+private:
+ // Effective limits, stored by SetChunkSize()
+ size_t m_ChunkSizeMin = 0;
+ size_t m_ChunkSizeMax = 0;
+ size_t m_ChunkSizeAvg = 0;
+
+ uint32_t m_Discriminator = 0; // Computed in SetChunkSize()
+ uint32_t m_Threshold = 0; // Computed in SetChunkSize()
+
+ bool m_UseThreshold = true;
+
+ // Bounds and default applied by SetChunkSize() when it has to derive a size itself
+ static constexpr size_t kChunkSizeLimitMax = 64 * 1024 * 1024;
+ static constexpr size_t kChunkSizeLimitMin = 1024;
+ static constexpr size_t kDefaultAverageChunkSize = 64 * 1024;
+
+ // Rolling hash window; m_WindowSize is the number of valid bytes currently in m_Window
+ static constexpr int kWindowSize = 48;
+ uint8_t m_Window[kWindowSize];
+ uint32_t m_WindowSize = 0;
+
+ // State of the chunk in progress, cleared by InternalReset()
+ uint32_t m_CurrentHash = 0;
+ uint32_t m_CurrentChunkSize = 0;
+
+ uint64_t m_BytesScanned = 0;
+
+ // ScanChunk() without the m_BytesScanned bookkeeping
+ size_t InternalScanChunk(const void* DataBytes, size_t ByteCount);
+ // Clears the per-chunk state only (hash, chunk length, window fill level)
+ void InternalReset();
+};
+
+} // namespace zen
diff --git a/src/zenutil/include/zenutil/chunkblock.h b/src/zenutil/include/zenutil/chunkblock.h
new file mode 100644
index 000000000..9b7414629
--- /dev/null
+++ b/src/zenutil/include/zenutil/chunkblock.h
@@ -0,0 +1,32 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zencore/iohash.h>
+
+#include <zencore/compactbinary.h>
+#include <zencore/compress.h>
+
+#include <optional>
+#include <vector>
+
+namespace zen {
+
+// Describes one chunk block: the hash identifying the block plus the hashes and raw lengths of
+// the chunks it contains.
+struct ChunkBlockDescription
+{
+ // Read from the "rawHash" field by ParseChunkBlockDescription()
+ IoHash BlockHash;
+ // Read from the "rawHashes" array by ParseChunkBlockDescription()
+ std::vector<IoHash> ChunkHashes;
+ // Read from the "chunkRawLengths" array; presumably parallel to ChunkHashes - confirm
+ std::vector<uint32_t> ChunkRawLengths;
+};
+
+// Conversion between ChunkBlockDescription and its compact binary representation.
+// ParseChunkBlockDescription() only fills in the chunk lists when the object's "rawHash" field
+// yields a non-zero hash; otherwise the returned description has no chunks.
+// NOTE(review): ParseChunkBlockDescriptionList() presumably applies the same parsing to every
+// block stored in BlocksObject - confirm against the implementation.
+std::vector<ChunkBlockDescription> ParseChunkBlockDescriptionList(const CbObjectView& BlocksObject);
+ChunkBlockDescription ParseChunkBlockDescription(const CbObjectView& BlockObject);
+CbObject BuildChunkBlockDescription(const ChunkBlockDescription& Block, CbObjectView MetaData);
+
+// Callback that is handed a chunk's raw hash and returns a (uint64_t, CompressedBuffer) pair;
+// presumably the chunk's raw size and its compressed payload - confirm.
+// NOTE(review): std::function is used here although <functional> is not included directly by
+// this header - confirm that it always arrives through one of the zencore headers.
+typedef std::function<std::pair<uint64_t, CompressedBuffer>(const IoHash& RawHash)> FetchChunkFunc;
+
+// GenerateChunkBlock() takes (hash, fetch callback) pairs and reports the resulting block through
+// OutBlock; IterateChunkBlock() invokes Visitor with a chunk and its hash. The meaning of the
+// bool result is not visible from this header - presumably success/failure, confirm.
+CompressedBuffer GenerateChunkBlock(std::vector<std::pair<IoHash, FetchChunkFunc>>&& FetchChunks, ChunkBlockDescription& OutBlock);
+bool IterateChunkBlock(const SharedBuffer& BlockPayload,
+ std::function<void(CompressedBuffer&& Chunk, const IoHash& AttachmentHash)> Visitor);
+
+} // namespace zen
diff --git a/src/zenutil/include/zenutil/chunkedfile.h b/src/zenutil/include/zenutil/chunkedfile.h
new file mode 100644
index 000000000..7110ad317
--- /dev/null
+++ b/src/zenutil/include/zenutil/chunkedfile.h
@@ -0,0 +1,58 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zencore/iobuffer.h>
+#include <zencore/iohash.h>
+#include <zencore/zencore.h>
+
+#include <functional>
+#include <vector>
+
+namespace zen {
+
+class BasicFile;
+
+// Result of chunking a piece of raw data: the size and hash of the raw data plus the chunks it
+// was split into.
+// NOTE(review): presumably ChunkHashes holds each distinct chunk once and ChunkSequence lists
+// indices into it in stream order - confirm against ChunkData()/Reconstruct().
+struct ChunkedInfo
+{
+ uint64_t RawSize = 0;
+ IoHash RawHash;
+ std::vector<uint32_t> ChunkSequence;
+ std::vector<IoHash> ChunkHashes;
+};
+
+// Location of one chunk: an offset and a size, presumably in bytes within the data that was
+// chunked - confirm. The trailing comments give the size of each field in bytes.
+struct ChunkSource
+{
+ uint64_t Offset; // 8 bytes
+ uint32_t Size; // 4 bytes
+};
+
+// ChunkedInfo together with the source location of its chunks, as returned by ChunkData().
+// NOTE(review): ChunkSources is presumably indexed like Info.ChunkHashes - confirm.
+struct ChunkedInfoWithSource
+{
+ ChunkedInfo Info;
+ std::vector<ChunkSource> ChunkSources;
+};
+
+// Tuning parameters accepted by ChunkData(). The defaults are a minimum of 1920 bytes, a maximum
+// of 16 KiB and an average of 3 KiB.
+// NOTE(review): these presumably map onto ZenChunkHelper::SetUseThreshold() and
+// ZenChunkHelper::SetChunkSize() - confirm against the ChunkData() implementation.
+struct ChunkedParams
+{
+ bool UseThreshold = true;
+ size_t MinSize = (2u * 1024u) - 128u;
+ size_t MaxSize = (16u * 1024u);
+ size_t AvgSize = (3u * 1024u);
+};
+
+// Chunking preset, presumably tuned for shader bytecode payloads (going by the name - confirm).
+// Being `static const` in a header, every translation unit that includes it gets its own copy.
+static const ChunkedParams UShaderByteCodeParams = {.UseThreshold = true, .MinSize = 17280, .MaxSize = 139264, .AvgSize = 36340};
+
+// Chunks the range described by Offset and Size of RawData using Params. BytesProcessed is
+// optional (defaults to nullptr); presumably it is advanced as data is consumed so that callers
+// can report progress - confirm.
+// NOTE(review): std::atomic and std::filesystem::path are used below although <atomic> and
+// <filesystem> are not included directly by this header - confirm they arrive transitively.
+ChunkedInfoWithSource ChunkData(BasicFile& RawData,
+ uint64_t Offset,
+ uint64_t Size,
+ ChunkedParams Params = {},
+ std::atomic<uint64_t>* BytesProcessed = nullptr);
+// Takes a ChunkedInfo, a target path and a callback that supplies the buffer for a given chunk
+// hash; presumably writes the reassembled data to TargetPath - confirm.
+void Reconstruct(const ChunkedInfo& Info,
+ const std::filesystem::path& TargetPath,
+ std::function<IoBuffer(const IoHash& ChunkHash)> GetChunk);
+// Conversion of a ChunkedInfo to and from an IoBuffer
+IoBuffer SerializeChunkedInfo(const ChunkedInfo& Info);
+ChunkedInfo DeserializeChunkedInfo(IoBuffer& Buffer);
+
+// Empty hook called from zenutil_forcelinktests()
+void chunkedfile_forcelink();
+} // namespace zen
diff --git a/src/zenutil/zenutil.cpp b/src/zenutil/zenutil.cpp
index c54144549..19eb63ce9 100644
--- a/src/zenutil/zenutil.cpp
+++ b/src/zenutil/zenutil.cpp
@@ -6,6 +6,7 @@
# include <zenutil/cache/cacherequests.h>
# include <zenutil/cache/rpcrecording.h>
+# include <zenutil/chunkedfile.h>
namespace zen {
@@ -15,6 +16,7 @@ zenutil_forcelinktests()
cachepolicy_forcelink();
cache::rpcrecord_forcelink();
cacherequests_forcelink();
+ chunkedfile_forcelink();
}
} // namespace zen