aboutsummaryrefslogtreecommitdiff
path: root/src/zenremotestore/chunking/chunkingcontroller.cpp
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2025-10-20 12:09:46 +0200
committerGitHub Enterprise <[email protected]>2025-10-20 12:09:46 +0200
commitc1af02eeb2badfbd2c01125730c6b85bbed8be9e (patch)
treed5a21612f886940166f905b6abc408959220834d /src/zenremotestore/chunking/chunkingcontroller.cpp
parent5.7.7-pre0 (diff)
downloadzen-c1af02eeb2badfbd2c01125730c6b85bbed8be9e.tar.xz
zen-c1af02eeb2badfbd2c01125730c6b85bbed8be9e.zip
updated chunking strategy (#589)
- Improvement: `zen builds` now splits large files that are compress-only into 64 MB chunks to avoid very large files in Cloud Storage - Improvement: `zen builds` now treats `.msixvc` files as non-compressible Moved and cleaned up compactbinary_helpers functions Tweaked fixed chunking implementation for better performance Refactored so we have one list of "non-compressible" extensions Implemented new `StandardChunkingStrategy` and moved the two existing ones to a hidden legacy namespace Added `FilteredDownloadedBytesPerSecond.Start();` call that got lost during previous refactoring
Diffstat (limited to 'src/zenremotestore/chunking/chunkingcontroller.cpp')
-rw-r--r--src/zenremotestore/chunking/chunkingcontroller.cpp642
1 files changed, 427 insertions, 215 deletions
diff --git a/src/zenremotestore/chunking/chunkingcontroller.cpp b/src/zenremotestore/chunking/chunkingcontroller.cpp
index cc20446ea..91ca18d10 100644
--- a/src/zenremotestore/chunking/chunkingcontroller.cpp
+++ b/src/zenremotestore/chunking/chunkingcontroller.cpp
@@ -3,7 +3,7 @@
#include <zenremotestore/chunking/chunkingcontroller.h>
#include <zencore/basicfile.h>
-#include <zencore/compactbinarybuilder.h>
+#include <zencore/compactbinaryutil.h>
#include <zencore/filesystem.h>
#include <zencore/trace.h>
@@ -16,23 +16,12 @@ namespace zen {
using namespace std::literals;
namespace {
- std::vector<std::string> ReadStringArray(CbArrayView StringArray)
- {
- std::vector<std::string> Result;
- Result.reserve(StringArray.Num());
- for (CbFieldView FieldView : StringArray)
- {
- Result.emplace_back(FieldView.AsString());
- }
- return Result;
- }
-
ChunkedParams ReadChunkParams(CbObjectView Params)
{
bool UseThreshold = Params["UseThreshold"sv].AsBool(true);
- size_t MinSize = Params["MinSize"sv].AsUInt64(DefaultChunkedParams.MinSize);
- size_t MaxSize = Params["MaxSize"sv].AsUInt64(DefaultChunkedParams.MaxSize);
- size_t AvgSize = Params["AvgSize"sv].AsUInt64(DefaultChunkedParams.AvgSize);
+ size_t MinSize = Params["MinSize"sv].AsUInt64(DefaultDynamicChunkingParams.MinSize);
+ size_t MaxSize = Params["MaxSize"sv].AsUInt64(DefaultDynamicChunkingParams.MaxSize);
+ size_t AvgSize = Params["AvgSize"sv].AsUInt64(DefaultDynamicChunkingParams.AvgSize);
return ChunkedParams{.UseThreshold = UseThreshold, .MinSize = MinSize, .MaxSize = MaxSize, .AvgSize = AvgSize};
}
@@ -50,6 +39,44 @@ namespace {
Writer.EndObject(); // ChunkingParams
}
+} // namespace
+
+namespace legacy {
+ const std::vector<std::string> DefaultChunkingExcludeExtensions = {
+ ".exe", ".dll", ".pdb", ".self", ".mp4", ".zip", ".7z", ".bzip", ".rar", ".gzip",
+ ".sym", ".psym", ".txt", ".ini", ".json", ".verse", ".versemodule", ".jpg", ".c", ".h",
+ ".cpp", ".cxx", ".c++", ".cc", ".hpp", ".hxx", ".h++", ".py", ".ogg", ".plist"};
+
+ const std::vector<std::string> DefaultFixedChunkingExtensions = {".apk", ".nsp", ".xvc", ".pkg", ".dmg", ".ipa"};
+ const bool DefaultChunkingExcludeElfFiles = true;
+ const bool DefaultChunkingExcludeMachOFiles = true;
+
+ const size_t DefaultChunkingFileSizeLimit = DefaultDynamicChunkingParams.MaxSize;
+
+ const uint64_t DefaultFixedChunkingChunkSize = 32u * 1024u * 1024u;
+ const uint64_t DefaultMinSizeForFixedChunking = DefaultFixedChunkingChunkSize * 8u;
+
+ struct BasicChunkingControllerSettings
+ {
+ std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions;
+ bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles;
+ bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles;
+ uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit;
+ ChunkedParams ChunkingParams = DefaultDynamicChunkingParams;
+ };
+
+ struct ChunkingControllerWithFixedChunkingSettings
+ {
+ std::vector<std::string> FixedChunkingExtensions = DefaultFixedChunkingExtensions;
+ std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions;
+ bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles;
+ bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles;
+ uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit;
+ ChunkedParams ChunkingParams = DefaultDynamicChunkingParams;
+ uint64_t FixedChunkingChunkSize = DefaultFixedChunkingChunkSize;
+ uint64_t MinSizeForFixedChunking = DefaultMinSizeForFixedChunking;
+ };
+
bool IsElfFile(BasicFile& Buffer)
{
if (Buffer.FileSize() > 4)
@@ -77,114 +104,363 @@ namespace {
}
return false;
}
-} // namespace
-class BasicChunkingController : public ChunkingController
-{
-public:
- BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings)
+ //////////// BasicChunkingController
+
+ class BasicChunkingController : public ChunkingController
{
- m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size());
- for (const std::string& Extension : Settings.ExcludeExtensions)
+ public:
+ BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings)
{
- m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
+ m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size());
+ for (const std::string& Extension : Settings.ExcludeExtensions)
+ {
+ m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
+ }
}
- }
- BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {}
+ BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {}
- virtual bool ProcessFile(const std::filesystem::path& InputPath,
- uint64_t RawSize,
- ChunkedInfoWithSource& OutChunked,
- std::atomic<uint64_t>& BytesProcessed,
- std::atomic<bool>& AbortFlag) const override
- {
- ZEN_TRACE_CPU("BasicChunkingController::ProcessFile");
- if (RawSize < m_Settings.ChunkFileSizeLimit)
+ virtual bool ProcessFile(const std::filesystem::path& InputPath,
+ uint64_t RawSize,
+ ChunkedInfoWithSource& OutChunked,
+ std::atomic<uint64_t>& BytesProcessed,
+ std::atomic<bool>& AbortFlag) const override
{
- return false;
- }
+ ZEN_TRACE_CPU("BasicChunkingController::ProcessFile");
+ if (RawSize < m_Settings.ChunkFileSizeLimit)
+ {
+ return false;
+ }
- const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string());
- const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash);
+ const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string());
+ const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash);
- if (ExcludeFromChunking)
- {
- return false;
+ if (ExcludeFromChunking)
+ {
+ return false;
+ }
+
+ BasicFile Buffer(InputPath, BasicFile::Mode::kRead);
+ if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer))
+ {
+ return false;
+ }
+ if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer))
+ {
+ return false;
+ }
+
+ OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag);
+ return true;
}
- BasicFile Buffer(InputPath, BasicFile::Mode::kRead);
- if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer))
+ virtual std::string_view GetName() const override { return Name; }
+
+ virtual CbObject GetParameters() const override
{
- return false;
+ CbObjectWriter Writer;
+ compactbinary_helpers::WriteArray(m_Settings.ExcludeExtensions, "ChunkExcludeExtensions"sv, Writer);
+
+ Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles);
+ Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles);
+ Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit);
+
+ WriteChunkParams(Writer, m_Settings.ChunkingParams);
+
+ return Writer.Save();
}
- if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer))
+ static constexpr std::string_view Name = "BasicChunkingController"sv;
+
+ private:
+ static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters)
{
- return false;
+ return BasicChunkingControllerSettings{
+ .ExcludeExtensions = compactbinary_helpers::ReadArray<std::string>("ChunkExcludeExtensions"sv, Parameters),
+ .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles),
+ .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles),
+ .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit),
+ .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())};
}
- OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag);
- return true;
+ const BasicChunkingControllerSettings m_Settings;
+ tsl::robin_set<uint32_t> m_ExcludeExtensionHashes;
+ };
+
+ std::unique_ptr<ChunkingController> CreateBasicChunkingController(CbObjectView Parameters)
+ {
+ return std::make_unique<BasicChunkingController>(Parameters);
}
- virtual std::string_view GetName() const override { return Name; }
+ //////////// ChunkingControllerWithFixedChunking
- virtual CbObject GetParameters() const override
+ class ChunkingControllerWithFixedChunking : public ChunkingController
{
- CbObjectWriter Writer;
- Writer.BeginArray("ChunkExcludeExtensions"sv);
+ public:
+ ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings)
{
- for (const std::string& Extension : m_Settings.ExcludeExtensions)
+ m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size());
+ for (const std::string& Extension : Settings.ExcludeExtensions)
+ {
+ m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
+ }
+
+ m_FixedChunkingExtensionHashes.reserve(Settings.FixedChunkingExtensions.size());
+ for (const std::string& Extension : Settings.FixedChunkingExtensions)
{
- Writer.AddString(Extension);
+ m_FixedChunkingExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
}
}
- Writer.EndArray(); // ChunkExcludeExtensions
- Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles);
- Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles);
- Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit);
+ ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {}
- WriteChunkParams(Writer, m_Settings.ChunkingParams);
+ virtual bool ProcessFile(const std::filesystem::path& InputPath,
+ uint64_t RawSize,
+ ChunkedInfoWithSource& OutChunked,
+ std::atomic<uint64_t>& BytesProcessed,
+ std::atomic<bool>& AbortFlag) const override
+ {
+ ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile");
+ if (RawSize < m_Settings.ChunkFileSizeLimit)
+ {
+ return false;
+ }
- return Writer.Save();
- }
- static constexpr std::string_view Name = "BasicChunkingController"sv;
+ const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string());
+ const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash);
-private:
- static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters)
+ if (ExcludeFromChunking)
+ {
+ return false;
+ }
+
+ const bool FixedChunkingExtension = m_FixedChunkingExtensionHashes.contains(ExtensionHash);
+
+ if (FixedChunkingExtension)
+ {
+ if (RawSize < m_Settings.MinSizeForFixedChunking)
+ {
+ return false;
+ }
+ ZEN_TRACE_CPU("FixedChunking");
+ IoHashStream FullHasher;
+ BasicFile Source(InputPath, BasicFile::Mode::kRead);
+ uint64_t Offset = 0;
+ tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex;
+ const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize);
+ ChunkHashToChunkIndex.reserve(ExpectedChunkCount);
+ OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount);
+ OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount);
+ OutChunked.ChunkSources.reserve(ExpectedChunkCount);
+
+ static const uint64_t BufferingSize = 256u * 1024u;
+
+ IoHashStream ChunkHasher;
+
+ while (Offset < RawSize)
+ {
+ if (AbortFlag)
+ {
+ return false;
+ }
+
+ ChunkHasher.Reset();
+
+ uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize);
+ if (ChunkSize >= (BufferingSize + BufferingSize / 2))
+ {
+ ScanFile(Source.Handle(),
+ Offset,
+ ChunkSize,
+ BufferingSize,
+ [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) {
+ FullHasher.Append(Data, Size);
+ ChunkHasher.Append(Data, Size);
+ BytesProcessed.fetch_add(Size);
+ });
+ }
+ else
+ {
+ IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize);
+ FullHasher.Append(ChunkData);
+ ChunkHasher.Append(ChunkData);
+ BytesProcessed.fetch_add(ChunkSize);
+ }
+
+ const IoHash ChunkHash = ChunkHasher.GetHash();
+ if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end())
+ {
+ OutChunked.Info.ChunkSequence.push_back(It->second);
+ }
+ else
+ {
+ uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size());
+ OutChunked.Info.ChunkHashes.push_back(ChunkHash);
+ OutChunked.Info.ChunkSequence.push_back(ChunkIndex);
+ OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)});
+ }
+ Offset += ChunkSize;
+ }
+ OutChunked.Info.RawSize = RawSize;
+ OutChunked.Info.RawHash = FullHasher.GetHash();
+ return true;
+ }
+ else
+ {
+ BasicFile Buffer(InputPath, BasicFile::Mode::kRead);
+ if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer))
+ {
+ return false;
+ }
+ if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer))
+ {
+ return false;
+ }
+
+ OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag);
+ return true;
+ }
+ }
+
+ virtual std::string_view GetName() const override { return Name; }
+
+ virtual CbObject GetParameters() const override
+ {
+ CbObjectWriter Writer;
+ compactbinary_helpers::WriteArray(m_Settings.FixedChunkingExtensions, "FixedChunkingExtensions", Writer);
+ compactbinary_helpers::WriteArray(m_Settings.ExcludeExtensions, "ChunkExcludeExtensions", Writer);
+
+ Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles);
+ Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles);
+
+ Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit);
+
+ WriteChunkParams(Writer, m_Settings.ChunkingParams);
+
+ Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize);
+ Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking);
+ return Writer.Save();
+ }
+
+ static constexpr std::string_view Name = "ChunkingControllerWithFixedChunking"sv;
+
+ private:
+        // Builds the controller settings from a serialized parameter object.
+        // Scalar fields that are absent from Parameters fall back to the legacy
+        // defaults declared at the top of this namespace, so that a missing field
+        // yields the same value as a default-constructed settings struct.
+        static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters)
+        {
+            return ChunkingControllerWithFixedChunkingSettings{
+                .FixedChunkingExtensions = compactbinary_helpers::ReadArray<std::string>("FixedChunkingExtensions"sv, Parameters),
+                .ExcludeExtensions       = compactbinary_helpers::ReadArray<std::string>("ChunkExcludeExtensions"sv, Parameters),
+                .ExcludeElfFiles         = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles),
+                .ExcludeMachOFiles       = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles),
+                .ChunkFileSizeLimit      = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit),
+                .ChunkingParams          = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()),
+                .FixedChunkingChunkSize  = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize),
+                // Fall back to the minimum-size default, not the chunk-size default.
+                .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultMinSizeForFixedChunking)};
+        }
+
+ const ChunkingControllerWithFixedChunkingSettings m_Settings;
+ tsl::robin_set<uint32_t> m_FixedChunkingExtensionHashes;
+ tsl::robin_set<uint32_t> m_ExcludeExtensionHashes;
+ };
+
+ std::unique_ptr<ChunkingController> CreateChunkingControllerWithFixedChunking(CbObjectView Parameters)
{
- return BasicChunkingControllerSettings{
- .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()),
- .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles),
- .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles),
- .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit),
- .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())};
+ return std::make_unique<ChunkingControllerWithFixedChunking>(Parameters);
}
+} // namespace legacy
- const BasicChunkingControllerSettings m_Settings;
- tsl::robin_set<uint32_t> m_ExcludeExtensionHashes;
-};
+//////////// StandardChunkingController
-class ChunkingControllerWithFixedChunking : public ChunkingController
+class StandardChunkingController : public ChunkingController
{
public:
- ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings)
+ StandardChunkingController(const StandardChunkingControllerSettings& Settings) : m_Settings(Settings)
{
- m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size());
- for (const std::string& Extension : Settings.ExcludeExtensions)
+ m_SplitOnlyExtensionHashes.reserve(Settings.SplitOnlyExtensions.size());
+ for (const std::string& Extension : Settings.SplitOnlyExtensions)
{
- m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
+ m_SplitOnlyExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
}
- m_FixedChunkingExtensionHashes.reserve(Settings.FixedChunkingExtensions.size());
- for (const std::string& Extension : Settings.FixedChunkingExtensions)
+ m_SplitAndCompressExtensionHashes.reserve(Settings.SplitAndCompressExtensions.size());
+ for (const std::string& Extension : Settings.SplitAndCompressExtensions)
{
- m_FixedChunkingExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
+ m_SplitAndCompressExtensionHashes.insert(HashStringAsLowerDjb2(Extension));
}
}
- ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {}
+ StandardChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {}
+
+    // Splits Source into consecutive chunks of FixedChunkSize bytes and records the
+    // result in OutChunked.
+    //
+    // - Source:         open file to read from; bytes [0, RawSize) are hashed.
+    // - RawSize:        number of bytes to process.
+    // - OutChunked:     receives the unique chunk hashes, the per-chunk index sequence,
+    //                   the source offset/size of each unique chunk, and the hash and
+    //                   size of the whole range.
+    // - BytesProcessed: incremented by the number of bytes hashed as work progresses.
+    // - FixedChunkSize: target size of every chunk; a trailing remainder smaller than
+    //                   MinimumLastChunkSize is folded into the final chunk.
+    // - AbortFlag:      checked before each chunk; when set the function stops early.
+    //
+    // Returns true when the whole range was chunked, false when aborted or when
+    // FixedChunkSize is zero (nothing can be chunked with a zero chunk size).
+    bool FixedChunking(BasicFile&             Source,
+                       uint64_t               RawSize,
+                       ChunkedInfoWithSource& OutChunked,
+                       std::atomic<uint64_t>& BytesProcessed,
+                       const uint64_t         FixedChunkSize,
+                       std::atomic<bool>&     AbortFlag) const
+    {
+        ZEN_TRACE_CPU("FixedChunking");
+
+        if (FixedChunkSize == 0)
+        {
+            // The chunk size may come from serialized parameters; a zero value would
+            // divide by zero below and never advance the offset.
+            return false;
+        }
+
+        IoHashStream                                     FullHasher;
+        uint64_t                                         Offset = 0;
+        tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex;
+        const uint64_t                                   ExpectedChunkCount = 1 + (RawSize / FixedChunkSize);
+        ChunkHashToChunkIndex.reserve(ExpectedChunkCount);
+        OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount);
+        OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount);
+        OutChunked.ChunkSources.reserve(ExpectedChunkCount);
+
+        static const uint64_t BufferingSize = 256u * 1024u;
+        // Must be computed per call: it depends on FixedChunkSize, which differs between
+        // callers. A function-local static would freeze the value of the first call.
+        const uint64_t MinimumLastChunkSize = Min(128u * 1024u, FixedChunkSize / 32);
+
+        IoHashStream ChunkHasher;
+
+        BasicFileBuffer SourceBuffer(Source, Min(BufferingSize, RawSize));
+        while (Offset < RawSize)
+        {
+            if (AbortFlag)
+            {
+                return false;
+            }
+
+            ChunkHasher.Reset();
+
+            const uint64_t ChunkStartOffset = Offset;
+            const uint64_t BytesLeft        = RawSize - Offset;
+            uint64_t       ChunkSize        = std::min<uint64_t>(BytesLeft, FixedChunkSize);
+            if ((BytesLeft - ChunkSize) < MinimumLastChunkSize)
+            {
+                // Avoid small chunks from the end of the file
+                ChunkSize = BytesLeft;
+            }
+            const uint64_t End = ChunkStartOffset + ChunkSize;
+            while (Offset < End)
+            {
+                // Clamp to the end of the current chunk (not the end of the file) so the
+                // chunk hash never covers bytes that belong to the next chunk.
+                const uint64_t BufferSize = std::min<uint64_t>(End - Offset, BufferingSize);
+                MemoryView     ChunkData  = SourceBuffer.MakeView(BufferSize, Offset);
+                FullHasher.Append(ChunkData);
+                ChunkHasher.Append(ChunkData);
+                BytesProcessed.fetch_add(BufferSize);
+                Offset += BufferSize;
+            }
+
+            const IoHash ChunkHash = ChunkHasher.GetHash();
+            if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end())
+            {
+                // Identical chunk seen before: reference the existing entry.
+                OutChunked.Info.ChunkSequence.push_back(It->second);
+            }
+            else
+            {
+                uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size());
+                OutChunked.Info.ChunkHashes.push_back(ChunkHash);
+                OutChunked.Info.ChunkSequence.push_back(ChunkIndex);
+                OutChunked.ChunkSources.push_back({.Offset = ChunkStartOffset, .Size = gsl::narrow<uint32_t>(ChunkSize)});
+                // Remember the chunk so later duplicates hit the lookup above.
+                ChunkHashToChunkIndex.insert({ChunkHash, ChunkIndex});
+            }
+        }
+
+        OutChunked.Info.RawSize = RawSize;
+        OutChunked.Info.RawHash = FullHasher.GetHash();
+        return true;
+    }
virtual bool ProcessFile(const std::filesystem::path& InputPath,
uint64_t RawSize,
@@ -192,106 +468,60 @@ public:
std::atomic<uint64_t>& BytesProcessed,
std::atomic<bool>& AbortFlag) const override
{
- ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile");
- if (RawSize < m_Settings.ChunkFileSizeLimit)
- {
- return false;
- }
+ ZEN_TRACE_CPU("StandardChunkingController::ProcessFile");
- const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string());
- const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash);
+ const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string());
- if (ExcludeFromChunking)
+ if (m_SplitOnlyExtensionHashes.contains(ExtensionHash))
{
- return false;
+ if (RawSize < m_Settings.SplitOnlyMinSize)
+ {
+ return false;
+ }
+ BasicFile Source(InputPath, BasicFile::Mode::kRead);
+ return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitOnlyChunkSize, AbortFlag);
}
- const bool FixedChunkingExtension = m_FixedChunkingExtensionHashes.contains(ExtensionHash);
-
- if (FixedChunkingExtension)
+ if (m_SplitAndCompressExtensionHashes.contains(ExtensionHash))
{
- if (RawSize < m_Settings.MinSizeForFixedChunking)
+ if (RawSize < m_Settings.SplitAndCompressMinSize)
{
return false;
}
- ZEN_TRACE_CPU("FixedChunking");
- IoHashStream FullHasher;
- BasicFile Source(InputPath, BasicFile::Mode::kRead);
- uint64_t Offset = 0;
- tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex;
- const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize);
- ChunkHashToChunkIndex.reserve(ExpectedChunkCount);
- OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount);
- OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount);
- OutChunked.ChunkSources.reserve(ExpectedChunkCount);
-
- static const uint64_t BufferingSize = 256u * 1024u;
-
- IoHashStream ChunkHasher;
-
- while (Offset < RawSize)
+ BasicFile Source(InputPath, BasicFile::Mode::kRead);
+ return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitAndCompressChunkSize, AbortFlag);
+ }
+
+ if (RawSize > sizeof(uint32_t) && !m_Settings.SplitAndCompressFileLeadingBytes.empty())
+ {
+ BasicFile Source(InputPath, BasicFile::Mode::kRead);
+ uint32_t LeadingBytes = 0;
+ Source.Read(&LeadingBytes, 4, 0);
+ if (std::find(m_Settings.SplitAndCompressFileLeadingBytes.begin(),
+ m_Settings.SplitAndCompressFileLeadingBytes.end(),
+ LeadingBytes) != m_Settings.SplitAndCompressFileLeadingBytes.end())
{
- if (AbortFlag)
+ if (RawSize < m_Settings.SplitAndCompressMinSize)
{
return false;
}
-
- ChunkHasher.Reset();
-
- uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize);
- if (ChunkSize >= (BufferingSize + BufferingSize / 2))
- {
- ScanFile(Source.Handle(),
- Offset,
- ChunkSize,
- BufferingSize,
- [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) {
- FullHasher.Append(Data, Size);
- ChunkHasher.Append(Data, Size);
- BytesProcessed.fetch_add(Size);
- });
- }
- else
- {
- IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize);
- FullHasher.Append(ChunkData);
- ChunkHasher.Append(ChunkData);
- BytesProcessed.fetch_add(ChunkSize);
- }
-
- const IoHash ChunkHash = ChunkHasher.GetHash();
- if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end())
- {
- OutChunked.Info.ChunkSequence.push_back(It->second);
- }
else
{
- uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size());
- OutChunked.Info.ChunkHashes.push_back(ChunkHash);
- OutChunked.Info.ChunkSequence.push_back(ChunkIndex);
- OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)});
+ return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitAndCompressChunkSize, AbortFlag);
}
- Offset += ChunkSize;
}
- OutChunked.Info.RawSize = RawSize;
- OutChunked.Info.RawHash = FullHasher.GetHash();
- return true;
+ }
+
+ if (RawSize < m_Settings.DynamicChunkingParams.MaxSize)
+ {
+ return false;
}
else
{
- BasicFile Buffer(InputPath, BasicFile::Mode::kRead);
- if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer))
- {
- return false;
- }
- if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer))
- {
- return false;
- }
-
- OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag);
- return true;
+ BasicFile Source(InputPath, BasicFile::Mode::kRead);
+ OutChunked = ChunkData(Source, 0, RawSize, m_Settings.DynamicChunkingParams, &BytesProcessed, &AbortFlag);
}
+ return true;
}
virtual std::string_view GetName() const override { return Name; }
@@ -299,89 +529,71 @@ public:
virtual CbObject GetParameters() const override
{
CbObjectWriter Writer;
- Writer.BeginArray("FixedChunkingExtensions");
- {
- for (const std::string& Extension : m_Settings.FixedChunkingExtensions)
- {
- Writer.AddString(Extension);
- }
- }
- Writer.EndArray(); // ChunkExcludeExtensions
-
- Writer.BeginArray("ChunkExcludeExtensions"sv);
- {
- for (const std::string& Extension : m_Settings.ExcludeExtensions)
- {
- Writer.AddString(Extension);
- }
- }
- Writer.EndArray(); // ChunkExcludeExtensions
- Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles);
- Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles);
+ compactbinary_helpers::WriteArray(m_Settings.SplitOnlyExtensions, "SplitOnlyExtensions"sv, Writer);
+ Writer.AddInteger("SplitOnlyChunkSize"sv, m_Settings.SplitOnlyChunkSize);
+ Writer.AddInteger("SplitOnlyMinSize"sv, m_Settings.SplitOnlyMinSize);
- Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit);
+ compactbinary_helpers::WriteArray(m_Settings.SplitAndCompressExtensions, "SplitAndCompressExtensions"sv, Writer);
+ compactbinary_helpers::WriteArray(m_Settings.SplitAndCompressFileLeadingBytes, "SplitAndCompressFileLeadingBytes"sv, Writer);
+ Writer.AddInteger("SplitAndCompressChunkSize"sv, m_Settings.SplitAndCompressChunkSize);
+ Writer.AddInteger("SplitAndCompressMinSize"sv, m_Settings.SplitAndCompressMinSize);
- WriteChunkParams(Writer, m_Settings.ChunkingParams);
+ WriteChunkParams(Writer, m_Settings.DynamicChunkingParams);
- Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize);
- Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking);
return Writer.Save();
}
- static constexpr std::string_view Name = "ChunkingControllerWithFixedChunking"sv;
+ static constexpr std::string_view Name = "StandardChunkingController"sv;
private:
- static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters)
+ static StandardChunkingControllerSettings ReadSettings(CbObjectView Parameters)
{
- return ChunkingControllerWithFixedChunkingSettings{
- .FixedChunkingExtensions = ReadStringArray(Parameters["FixedChunkingExtensions"sv].AsArrayView()),
- .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()),
- .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles),
- .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles),
- .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit),
- .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()),
- .FixedChunkingChunkSize = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize),
- .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultFixedChunkingChunkSize)};
+ return StandardChunkingControllerSettings{
+ .SplitOnlyExtensions = compactbinary_helpers::ReadArray<std::string>("SplitOnlyExtensions"sv, Parameters),
+ .SplitOnlyChunkSize = Parameters["SplitOnlyChunkSize"sv].AsUInt64(DefaultSplitOnlyChunkSize),
+ .SplitOnlyMinSize = Parameters["SplitOnlyMinSize"sv].AsUInt64(DefaultSplitOnlyMinSize),
+
+ .SplitAndCompressExtensions = compactbinary_helpers::ReadArray<std::string>("SplitAndCompressExtensions"sv, Parameters),
+ .SplitAndCompressFileLeadingBytes =
+ compactbinary_helpers::ReadArray<uint32_t>("SplitAndCompressFileLeadingBytes"sv, Parameters),
+ .SplitAndCompressChunkSize = Parameters["SplitAndCompressChunkSize"sv].AsUInt64(DefaultSplitAndCompressChunkSize),
+ .SplitAndCompressMinSize = Parameters["SplitAndCompressMinSize"sv].AsUInt64(DefaultSplitAndCompressMinSize),
+
+ .DynamicChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())};
}
- const ChunkingControllerWithFixedChunkingSettings m_Settings;
- tsl::robin_set<uint32_t> m_FixedChunkingExtensionHashes;
- tsl::robin_set<uint32_t> m_ExcludeExtensionHashes;
+ const StandardChunkingControllerSettings m_Settings;
+ tsl::robin_set<uint32_t> m_SplitOnlyExtensionHashes;
+ tsl::robin_set<uint32_t> m_SplitAndCompressExtensionHashes;
};
std::unique_ptr<ChunkingController>
-CreateBasicChunkingController(const BasicChunkingControllerSettings& Settings)
-{
- return std::make_unique<BasicChunkingController>(Settings);
-}
-std::unique_ptr<ChunkingController>
-CreateBasicChunkingController(CbObjectView Parameters)
+CreateStandardChunkingController(const StandardChunkingControllerSettings& Setting)
{
- return std::make_unique<BasicChunkingController>(Parameters);
+ return std::make_unique<StandardChunkingController>(Setting);
}
std::unique_ptr<ChunkingController>
-CreateChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Setting)
-{
- return std::make_unique<ChunkingControllerWithFixedChunking>(Setting);
-}
-std::unique_ptr<ChunkingController>
-CreateChunkingControllerWithFixedChunking(CbObjectView Parameters)
+CreateStandardChunkingController(CbObjectView Parameters)
{
- return std::make_unique<ChunkingControllerWithFixedChunking>(Parameters);
+ return std::make_unique<StandardChunkingController>(Parameters);
}
std::unique_ptr<ChunkingController>
CreateChunkingController(std::string_view Name, CbObjectView Parameters)
{
- if (Name == BasicChunkingController::Name)
+ if (Name == legacy::BasicChunkingController::Name)
+ {
+ return legacy::CreateBasicChunkingController(Parameters);
+ }
+ else if (Name == legacy::ChunkingControllerWithFixedChunking::Name)
{
- return CreateBasicChunkingController(Parameters);
+ return legacy::CreateChunkingControllerWithFixedChunking(Parameters);
}
- else if (Name == ChunkingControllerWithFixedChunking::Name)
+ else if (Name == StandardChunkingController::Name)
{
- return CreateChunkingControllerWithFixedChunking(Parameters);
+ return CreateStandardChunkingController(Parameters);
}
return {};
}