diff options
| author | Dan Engelbrecht <[email protected]> | 2025-10-20 12:09:46 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-10-20 12:09:46 +0200 |
| commit | c1af02eeb2badfbd2c01125730c6b85bbed8be9e (patch) | |
| tree | d5a21612f886940166f905b6abc408959220834d /src/zenremotestore/chunking/chunkingcontroller.cpp | |
| parent | 5.7.7-pre0 (diff) | |
| download | zen-c1af02eeb2badfbd2c01125730c6b85bbed8be9e.tar.xz zen-c1af02eeb2badfbd2c01125730c6b85bbed8be9e.zip | |
updated chunking strategy (#589)
- Improvement: `zen builds` now splits large files that are compress-only into 64 MB chunks to avoid very large files in Cloud Storage
- Improvement: `zen builds` now treats `.msixvc` files as non-compressible
Moved and cleaned up compactbinary_helpers functions
Tweaked fixed chunking implementation for better performance
Refactored so we have one list of "non-compressable" extensions
Implemented new `StandardChunkingStrategy` and moved the two existing ones into a hidden `legacy` namespace
Added `FilteredDownloadedBytesPerSecond.Start();` call that got lost during previous refactoring
Diffstat (limited to 'src/zenremotestore/chunking/chunkingcontroller.cpp')
| -rw-r--r-- | src/zenremotestore/chunking/chunkingcontroller.cpp | 642 |
1 files changed, 427 insertions, 215 deletions
diff --git a/src/zenremotestore/chunking/chunkingcontroller.cpp b/src/zenremotestore/chunking/chunkingcontroller.cpp index cc20446ea..91ca18d10 100644 --- a/src/zenremotestore/chunking/chunkingcontroller.cpp +++ b/src/zenremotestore/chunking/chunkingcontroller.cpp @@ -3,7 +3,7 @@ #include <zenremotestore/chunking/chunkingcontroller.h> #include <zencore/basicfile.h> -#include <zencore/compactbinarybuilder.h> +#include <zencore/compactbinaryutil.h> #include <zencore/filesystem.h> #include <zencore/trace.h> @@ -16,23 +16,12 @@ namespace zen { using namespace std::literals; namespace { - std::vector<std::string> ReadStringArray(CbArrayView StringArray) - { - std::vector<std::string> Result; - Result.reserve(StringArray.Num()); - for (CbFieldView FieldView : StringArray) - { - Result.emplace_back(FieldView.AsString()); - } - return Result; - } - ChunkedParams ReadChunkParams(CbObjectView Params) { bool UseThreshold = Params["UseThreshold"sv].AsBool(true); - size_t MinSize = Params["MinSize"sv].AsUInt64(DefaultChunkedParams.MinSize); - size_t MaxSize = Params["MaxSize"sv].AsUInt64(DefaultChunkedParams.MaxSize); - size_t AvgSize = Params["AvgSize"sv].AsUInt64(DefaultChunkedParams.AvgSize); + size_t MinSize = Params["MinSize"sv].AsUInt64(DefaultDynamicChunkingParams.MinSize); + size_t MaxSize = Params["MaxSize"sv].AsUInt64(DefaultDynamicChunkingParams.MaxSize); + size_t AvgSize = Params["AvgSize"sv].AsUInt64(DefaultDynamicChunkingParams.AvgSize); return ChunkedParams{.UseThreshold = UseThreshold, .MinSize = MinSize, .MaxSize = MaxSize, .AvgSize = AvgSize}; } @@ -50,6 +39,44 @@ namespace { Writer.EndObject(); // ChunkingParams } +} // namespace + +namespace legacy { + const std::vector<std::string> DefaultChunkingExcludeExtensions = { + ".exe", ".dll", ".pdb", ".self", ".mp4", ".zip", ".7z", ".bzip", ".rar", ".gzip", + ".sym", ".psym", ".txt", ".ini", ".json", ".verse", ".versemodule", ".jpg", ".c", ".h", + ".cpp", ".cxx", ".c++", ".cc", ".hpp", ".hxx", ".h++", ".py", 
".ogg", ".plist"}; + + const std::vector<std::string> DefaultFixedChunkingExtensions = {".apk", ".nsp", ".xvc", ".pkg", ".dmg", ".ipa"}; + const bool DefaultChunkingExcludeElfFiles = true; + const bool DefaultChunkingExcludeMachOFiles = true; + + const size_t DefaultChunkingFileSizeLimit = DefaultDynamicChunkingParams.MaxSize; + + const uint64_t DefaultFixedChunkingChunkSize = 32u * 1024u * 1024u; + const uint64_t DefaultMinSizeForFixedChunking = DefaultFixedChunkingChunkSize * 8u; + + struct BasicChunkingControllerSettings + { + std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; + bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; + bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; + uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; + ChunkedParams ChunkingParams = DefaultDynamicChunkingParams; + }; + + struct ChunkingControllerWithFixedChunkingSettings + { + std::vector<std::string> FixedChunkingExtensions = DefaultFixedChunkingExtensions; + std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; + bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; + bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; + uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; + ChunkedParams ChunkingParams = DefaultDynamicChunkingParams; + uint64_t FixedChunkingChunkSize = DefaultFixedChunkingChunkSize; + uint64_t MinSizeForFixedChunking = DefaultMinSizeForFixedChunking; + }; + bool IsElfFile(BasicFile& Buffer) { if (Buffer.FileSize() > 4) @@ -77,114 +104,363 @@ namespace { } return false; } -} // namespace -class BasicChunkingController : public ChunkingController -{ -public: - BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings) + //////////// BasicChunkingController + + class BasicChunkingController : public ChunkingController { - m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); - for (const std::string& Extension : 
Settings.ExcludeExtensions) + public: + BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings) { - m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); + for (const std::string& Extension : Settings.ExcludeExtensions) + { + m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + } } - } - BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} - virtual bool ProcessFile(const std::filesystem::path& InputPath, - uint64_t RawSize, - ChunkedInfoWithSource& OutChunked, - std::atomic<uint64_t>& BytesProcessed, - std::atomic<bool>& AbortFlag) const override - { - ZEN_TRACE_CPU("BasicChunkingController::ProcessFile"); - if (RawSize < m_Settings.ChunkFileSizeLimit) + virtual bool ProcessFile(const std::filesystem::path& InputPath, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + std::atomic<bool>& AbortFlag) const override { - return false; - } + ZEN_TRACE_CPU("BasicChunkingController::ProcessFile"); + if (RawSize < m_Settings.ChunkFileSizeLimit) + { + return false; + } - const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); - const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash); + const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); + const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash); - if (ExcludeFromChunking) - { - return false; + if (ExcludeFromChunking) + { + return false; + } + + BasicFile Buffer(InputPath, BasicFile::Mode::kRead); + if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + { + return false; + } + if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + { + return false; + } + + OutChunked = ChunkData(Buffer, 0, 
RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); + return true; } - BasicFile Buffer(InputPath, BasicFile::Mode::kRead); - if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + virtual std::string_view GetName() const override { return Name; } + + virtual CbObject GetParameters() const override { - return false; + CbObjectWriter Writer; + compactbinary_helpers::WriteArray(m_Settings.ExcludeExtensions, "ChunkExcludeExtensions"sv, Writer); + + Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); + Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + + WriteChunkParams(Writer, m_Settings.ChunkingParams); + + return Writer.Save(); } - if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + static constexpr std::string_view Name = "BasicChunkingController"sv; + + private: + static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters) { - return false; + return BasicChunkingControllerSettings{ + .ExcludeExtensions = compactbinary_helpers::ReadArray<std::string>("ChunkExcludeExtensions"sv, Parameters), + .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), + .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), + .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), + .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; } - OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); - return true; + const BasicChunkingControllerSettings m_Settings; + tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; + }; + + std::unique_ptr<ChunkingController> CreateBasicChunkingController(CbObjectView Parameters) + { + return std::make_unique<BasicChunkingController>(Parameters); } - virtual std::string_view GetName() const override { return Name; } + 
//////////// ChunkingControllerWithFixedChunking - virtual CbObject GetParameters() const override + class ChunkingControllerWithFixedChunking : public ChunkingController { - CbObjectWriter Writer; - Writer.BeginArray("ChunkExcludeExtensions"sv); + public: + ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings) { - for (const std::string& Extension : m_Settings.ExcludeExtensions) + m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); + for (const std::string& Extension : Settings.ExcludeExtensions) + { + m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + } + + m_FixedChunkingExtensionHashes.reserve(Settings.FixedChunkingExtensions.size()); + for (const std::string& Extension : Settings.FixedChunkingExtensions) { - Writer.AddString(Extension); + m_FixedChunkingExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); } } - Writer.EndArray(); // ChunkExcludeExtensions - Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); - Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); - Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} - WriteChunkParams(Writer, m_Settings.ChunkingParams); + virtual bool ProcessFile(const std::filesystem::path& InputPath, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + std::atomic<bool>& AbortFlag) const override + { + ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile"); + if (RawSize < m_Settings.ChunkFileSizeLimit) + { + return false; + } - return Writer.Save(); - } - static constexpr std::string_view Name = "BasicChunkingController"sv; + const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); + const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash); -private: - 
static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters) + if (ExcludeFromChunking) + { + return false; + } + + const bool FixedChunkingExtension = m_FixedChunkingExtensionHashes.contains(ExtensionHash); + + if (FixedChunkingExtension) + { + if (RawSize < m_Settings.MinSizeForFixedChunking) + { + return false; + } + ZEN_TRACE_CPU("FixedChunking"); + IoHashStream FullHasher; + BasicFile Source(InputPath, BasicFile::Mode::kRead); + uint64_t Offset = 0; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize); + ChunkHashToChunkIndex.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount); + OutChunked.ChunkSources.reserve(ExpectedChunkCount); + + static const uint64_t BufferingSize = 256u * 1024u; + + IoHashStream ChunkHasher; + + while (Offset < RawSize) + { + if (AbortFlag) + { + return false; + } + + ChunkHasher.Reset(); + + uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize); + if (ChunkSize >= (BufferingSize + BufferingSize / 2)) + { + ScanFile(Source.Handle(), + Offset, + ChunkSize, + BufferingSize, + [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) { + FullHasher.Append(Data, Size); + ChunkHasher.Append(Data, Size); + BytesProcessed.fetch_add(Size); + }); + } + else + { + IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize); + FullHasher.Append(ChunkData); + ChunkHasher.Append(ChunkData); + BytesProcessed.fetch_add(ChunkSize); + } + + const IoHash ChunkHash = ChunkHasher.GetHash(); + if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) + { + OutChunked.Info.ChunkSequence.push_back(It->second); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size()); + OutChunked.Info.ChunkHashes.push_back(ChunkHash); + 
OutChunked.Info.ChunkSequence.push_back(ChunkIndex); + OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)}); + } + Offset += ChunkSize; + } + OutChunked.Info.RawSize = RawSize; + OutChunked.Info.RawHash = FullHasher.GetHash(); + return true; + } + else + { + BasicFile Buffer(InputPath, BasicFile::Mode::kRead); + if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + { + return false; + } + if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + { + return false; + } + + OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); + return true; + } + } + + virtual std::string_view GetName() const override { return Name; } + + virtual CbObject GetParameters() const override + { + CbObjectWriter Writer; + compactbinary_helpers::WriteArray(m_Settings.FixedChunkingExtensions, "FixedChunkingExtensions", Writer); + compactbinary_helpers::WriteArray(m_Settings.ExcludeExtensions, "ChunkExcludeExtensions", Writer); + + Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); + Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + + Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + + WriteChunkParams(Writer, m_Settings.ChunkingParams); + + Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize); + Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking); + return Writer.Save(); + } + + static constexpr std::string_view Name = "ChunkingControllerWithFixedChunking"sv; + + private: + static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters) + { + return ChunkingControllerWithFixedChunkingSettings{ + .FixedChunkingExtensions = compactbinary_helpers::ReadArray<std::string>("FixedChunkingExtensions"sv, Parameters), + .ExcludeExtensions = compactbinary_helpers::ReadArray<std::string>("ChunkExcludeExtensions"sv, Parameters), + .ExcludeElfFiles = 
Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), + .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), + .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), + .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()), + .FixedChunkingChunkSize = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize), + .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultFixedChunkingChunkSize)}; + } + + const ChunkingControllerWithFixedChunkingSettings m_Settings; + tsl::robin_set<uint32_t> m_FixedChunkingExtensionHashes; + tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; + }; + + std::unique_ptr<ChunkingController> CreateChunkingControllerWithFixedChunking(CbObjectView Parameters) { - return BasicChunkingControllerSettings{ - .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()), - .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), - .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), - .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), - .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; + return std::make_unique<ChunkingControllerWithFixedChunking>(Parameters); } +} // namespace legacy - const BasicChunkingControllerSettings m_Settings; - tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; -}; +//////////// StandardChunkingController -class ChunkingControllerWithFixedChunking : public ChunkingController +class StandardChunkingController : public ChunkingController { public: - ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings) + StandardChunkingController(const StandardChunkingControllerSettings& Settings) : 
m_Settings(Settings) { - m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); - for (const std::string& Extension : Settings.ExcludeExtensions) + m_SplitOnlyExtensionHashes.reserve(Settings.SplitOnlyExtensions.size()); + for (const std::string& Extension : Settings.SplitOnlyExtensions) { - m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + m_SplitOnlyExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); } - m_FixedChunkingExtensionHashes.reserve(Settings.FixedChunkingExtensions.size()); - for (const std::string& Extension : Settings.FixedChunkingExtensions) + m_SplitAndCompressExtensionHashes.reserve(Settings.SplitAndCompressExtensions.size()); + for (const std::string& Extension : Settings.SplitAndCompressExtensions) { - m_FixedChunkingExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + m_SplitAndCompressExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); } } - ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + StandardChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + + bool FixedChunking(BasicFile& Source, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + const uint64_t FixedChunkSize, + std::atomic<bool>& AbortFlag) const + { + ZEN_TRACE_CPU("FixedChunking"); + + IoHashStream FullHasher; + uint64_t Offset = 0; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + const uint64_t ExpectedChunkCount = 1 + (RawSize / FixedChunkSize); + ChunkHashToChunkIndex.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount); + OutChunked.ChunkSources.reserve(ExpectedChunkCount); + + static const uint64_t BufferingSize = 256u * 1024u; + static const uint64_t MinimumLastChunkSize = Min(128u * 1024u, FixedChunkSize / 32); + + IoHashStream ChunkHasher; + + BasicFileBuffer 
SourceBuffer(Source, Min(BufferingSize, RawSize)); + while (Offset < RawSize) + { + if (AbortFlag) + { + return false; + } + + ChunkHasher.Reset(); + + const uint64_t ChunkStartOffset = Offset; + const uint64_t BytesLeft = RawSize - Offset; + uint64_t ChunkSize = std::min<uint64_t>(BytesLeft, FixedChunkSize); + if ((BytesLeft - ChunkSize) < MinimumLastChunkSize) + { + // Avoid small chunks from the end of the file + ChunkSize = BytesLeft; + } + const uint64_t End = ChunkStartOffset + ChunkSize; + while (Offset < End) + { + const uint64_t BufferSize = std::min<uint64_t>(RawSize - Offset, BufferingSize); + MemoryView ChunkData = SourceBuffer.MakeView(BufferSize, Offset); + FullHasher.Append(ChunkData); + ChunkHasher.Append(ChunkData); + BytesProcessed.fetch_add(BufferSize); + Offset += BufferSize; + } + + const IoHash ChunkHash = ChunkHasher.GetHash(); + if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) + { + OutChunked.Info.ChunkSequence.push_back(It->second); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size()); + OutChunked.Info.ChunkHashes.push_back(ChunkHash); + OutChunked.Info.ChunkSequence.push_back(ChunkIndex); + OutChunked.ChunkSources.push_back({.Offset = ChunkStartOffset, .Size = gsl::narrow<uint32_t>(ChunkSize)}); + } + } + + OutChunked.Info.RawSize = RawSize; + OutChunked.Info.RawHash = FullHasher.GetHash(); + return true; + } virtual bool ProcessFile(const std::filesystem::path& InputPath, uint64_t RawSize, @@ -192,106 +468,60 @@ public: std::atomic<uint64_t>& BytesProcessed, std::atomic<bool>& AbortFlag) const override { - ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile"); - if (RawSize < m_Settings.ChunkFileSizeLimit) - { - return false; - } + ZEN_TRACE_CPU("StandardChunkingController::ProcessFile"); - const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); - const bool ExcludeFromChunking = 
m_ExcludeExtensionHashes.contains(ExtensionHash); + const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); - if (ExcludeFromChunking) + if (m_SplitOnlyExtensionHashes.contains(ExtensionHash)) { - return false; + if (RawSize < m_Settings.SplitOnlyMinSize) + { + return false; + } + BasicFile Source(InputPath, BasicFile::Mode::kRead); + return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitOnlyChunkSize, AbortFlag); } - const bool FixedChunkingExtension = m_FixedChunkingExtensionHashes.contains(ExtensionHash); - - if (FixedChunkingExtension) + if (m_SplitAndCompressExtensionHashes.contains(ExtensionHash)) { - if (RawSize < m_Settings.MinSizeForFixedChunking) + if (RawSize < m_Settings.SplitAndCompressMinSize) { return false; } - ZEN_TRACE_CPU("FixedChunking"); - IoHashStream FullHasher; - BasicFile Source(InputPath, BasicFile::Mode::kRead); - uint64_t Offset = 0; - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; - const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize); - ChunkHashToChunkIndex.reserve(ExpectedChunkCount); - OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount); - OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount); - OutChunked.ChunkSources.reserve(ExpectedChunkCount); - - static const uint64_t BufferingSize = 256u * 1024u; - - IoHashStream ChunkHasher; - - while (Offset < RawSize) + BasicFile Source(InputPath, BasicFile::Mode::kRead); + return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitAndCompressChunkSize, AbortFlag); + } + + if (RawSize > sizeof(uint32_t) && !m_Settings.SplitAndCompressFileLeadingBytes.empty()) + { + BasicFile Source(InputPath, BasicFile::Mode::kRead); + uint32_t LeadingBytes = 0; + Source.Read(&LeadingBytes, 4, 0); + if (std::find(m_Settings.SplitAndCompressFileLeadingBytes.begin(), + m_Settings.SplitAndCompressFileLeadingBytes.end(), + LeadingBytes) != 
m_Settings.SplitAndCompressFileLeadingBytes.end()) { - if (AbortFlag) + if (RawSize < m_Settings.SplitAndCompressMinSize) { return false; } - - ChunkHasher.Reset(); - - uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize); - if (ChunkSize >= (BufferingSize + BufferingSize / 2)) - { - ScanFile(Source.Handle(), - Offset, - ChunkSize, - BufferingSize, - [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) { - FullHasher.Append(Data, Size); - ChunkHasher.Append(Data, Size); - BytesProcessed.fetch_add(Size); - }); - } - else - { - IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize); - FullHasher.Append(ChunkData); - ChunkHasher.Append(ChunkData); - BytesProcessed.fetch_add(ChunkSize); - } - - const IoHash ChunkHash = ChunkHasher.GetHash(); - if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) - { - OutChunked.Info.ChunkSequence.push_back(It->second); - } else { - uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size()); - OutChunked.Info.ChunkHashes.push_back(ChunkHash); - OutChunked.Info.ChunkSequence.push_back(ChunkIndex); - OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)}); + return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitAndCompressChunkSize, AbortFlag); } - Offset += ChunkSize; } - OutChunked.Info.RawSize = RawSize; - OutChunked.Info.RawHash = FullHasher.GetHash(); - return true; + } + + if (RawSize < m_Settings.DynamicChunkingParams.MaxSize) + { + return false; } else { - BasicFile Buffer(InputPath, BasicFile::Mode::kRead); - if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) - { - return false; - } - if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) - { - return false; - } - - OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); - return true; + BasicFile Source(InputPath, BasicFile::Mode::kRead); + 
OutChunked = ChunkData(Source, 0, RawSize, m_Settings.DynamicChunkingParams, &BytesProcessed, &AbortFlag); } + return true; } virtual std::string_view GetName() const override { return Name; } @@ -299,89 +529,71 @@ public: virtual CbObject GetParameters() const override { CbObjectWriter Writer; - Writer.BeginArray("FixedChunkingExtensions"); - { - for (const std::string& Extension : m_Settings.FixedChunkingExtensions) - { - Writer.AddString(Extension); - } - } - Writer.EndArray(); // ChunkExcludeExtensions - - Writer.BeginArray("ChunkExcludeExtensions"sv); - { - for (const std::string& Extension : m_Settings.ExcludeExtensions) - { - Writer.AddString(Extension); - } - } - Writer.EndArray(); // ChunkExcludeExtensions - Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); - Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + compactbinary_helpers::WriteArray(m_Settings.SplitOnlyExtensions, "SplitOnlyExtensions"sv, Writer); + Writer.AddInteger("SplitOnlyChunkSize"sv, m_Settings.SplitOnlyChunkSize); + Writer.AddInteger("SplitOnlyMinSize"sv, m_Settings.SplitOnlyMinSize); - Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + compactbinary_helpers::WriteArray(m_Settings.SplitAndCompressExtensions, "SplitAndCompressExtensions"sv, Writer); + compactbinary_helpers::WriteArray(m_Settings.SplitAndCompressFileLeadingBytes, "SplitAndCompressFileLeadingBytes"sv, Writer); + Writer.AddInteger("SplitAndCompressChunkSize"sv, m_Settings.SplitAndCompressChunkSize); + Writer.AddInteger("SplitAndCompressMinSize"sv, m_Settings.SplitAndCompressMinSize); - WriteChunkParams(Writer, m_Settings.ChunkingParams); + WriteChunkParams(Writer, m_Settings.DynamicChunkingParams); - Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize); - Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking); return Writer.Save(); } - static constexpr std::string_view Name = 
"ChunkingControllerWithFixedChunking"sv; + static constexpr std::string_view Name = "StandardChunkingController"sv; private: - static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters) + static StandardChunkingControllerSettings ReadSettings(CbObjectView Parameters) { - return ChunkingControllerWithFixedChunkingSettings{ - .FixedChunkingExtensions = ReadStringArray(Parameters["FixedChunkingExtensions"sv].AsArrayView()), - .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()), - .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), - .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), - .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), - .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()), - .FixedChunkingChunkSize = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize), - .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultFixedChunkingChunkSize)}; + return StandardChunkingControllerSettings{ + .SplitOnlyExtensions = compactbinary_helpers::ReadArray<std::string>("SplitOnlyExtensions"sv, Parameters), + .SplitOnlyChunkSize = Parameters["SplitOnlyChunkSize"sv].AsUInt64(DefaultSplitOnlyChunkSize), + .SplitOnlyMinSize = Parameters["SplitOnlyMinSize"sv].AsUInt64(DefaultSplitOnlyMinSize), + + .SplitAndCompressExtensions = compactbinary_helpers::ReadArray<std::string>("SplitAndCompressExtensions"sv, Parameters), + .SplitAndCompressFileLeadingBytes = + compactbinary_helpers::ReadArray<uint32_t>("SplitAndCompressFileLeadingBytes"sv, Parameters), + .SplitAndCompressChunkSize = Parameters["SplitAndCompressChunkSize"sv].AsUInt64(DefaultSplitAndCompressChunkSize), + .SplitAndCompressMinSize = Parameters["SplitAndCompressMinSize"sv].AsUInt64(DefaultSplitAndCompressMinSize), + + .DynamicChunkingParams = 
ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; } - const ChunkingControllerWithFixedChunkingSettings m_Settings; - tsl::robin_set<uint32_t> m_FixedChunkingExtensionHashes; - tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; + const StandardChunkingControllerSettings m_Settings; + tsl::robin_set<uint32_t> m_SplitOnlyExtensionHashes; + tsl::robin_set<uint32_t> m_SplitAndCompressExtensionHashes; }; std::unique_ptr<ChunkingController> -CreateBasicChunkingController(const BasicChunkingControllerSettings& Settings) -{ - return std::make_unique<BasicChunkingController>(Settings); -} -std::unique_ptr<ChunkingController> -CreateBasicChunkingController(CbObjectView Parameters) +CreateStandardChunkingController(const StandardChunkingControllerSettings& Setting) { - return std::make_unique<BasicChunkingController>(Parameters); + return std::make_unique<StandardChunkingController>(Setting); } std::unique_ptr<ChunkingController> -CreateChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Setting) -{ - return std::make_unique<ChunkingControllerWithFixedChunking>(Setting); -} -std::unique_ptr<ChunkingController> -CreateChunkingControllerWithFixedChunking(CbObjectView Parameters) +CreateStandardChunkingController(CbObjectView Parameters) { - return std::make_unique<ChunkingControllerWithFixedChunking>(Parameters); + return std::make_unique<StandardChunkingController>(Parameters); } std::unique_ptr<ChunkingController> CreateChunkingController(std::string_view Name, CbObjectView Parameters) { - if (Name == BasicChunkingController::Name) + if (Name == legacy::BasicChunkingController::Name) + { + return legacy::CreateBasicChunkingController(Parameters); + } + else if (Name == legacy::ChunkingControllerWithFixedChunking::Name) { - return CreateBasicChunkingController(Parameters); + return legacy::CreateChunkingControllerWithFixedChunking(Parameters); } - else if (Name == ChunkingControllerWithFixedChunking::Name) + else if (Name == 
StandardChunkingController::Name) { - return CreateChunkingControllerWithFixedChunking(Parameters); + return CreateStandardChunkingController(Parameters); } return {}; } |