aboutsummaryrefslogtreecommitdiff
path: root/src/zenutil/chunkingcontroller.cpp
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2025-06-03 16:21:01 +0200
committerGitHub Enterprise <[email protected]>2025-06-03 16:21:01 +0200
commita0b10b046095d57ffbdb46c83084601a832f4562 (patch)
treefe015645ea07d83c2784e3e28d0e976a37054859 /src/zenutil/chunkingcontroller.cpp
parentminor: fix unused variable warning on some compilers (diff)
downloadzen-a0b10b046095d57ffbdb46c83084601a832f4562.tar.xz
zen-a0b10b046095d57ffbdb46c83084601a832f4562.zip
fixed size chunking for encrypted files (#410)
- Improvement: Use fixed size block chunking for known encrypted/compressed file types - Improvement: Skip trying to compress chunks that are sourced from files that are known to be encrypted/compressed - Improvement: Add global open file cache for written files increasing throughput during download by reducing overhead of open/close of file by 80%
Diffstat (limited to 'src/zenutil/chunkingcontroller.cpp')
-rw-r--r--src/zenutil/chunkingcontroller.cpp289
1 files changed, 168 insertions, 121 deletions
diff --git a/src/zenutil/chunkingcontroller.cpp b/src/zenutil/chunkingcontroller.cpp
index a5ebce193..6fb4182c0 100644
--- a/src/zenutil/chunkingcontroller.cpp
+++ b/src/zenutil/chunkingcontroller.cpp
@@ -4,6 +4,7 @@
#include <zencore/basicfile.h>
#include <zencore/compactbinarybuilder.h>
+#include <zencore/filesystem.h>
#include <zencore/trace.h>
ZEN_THIRD_PARTY_INCLUDES_START
@@ -35,32 +36,54 @@ namespace {
return ChunkedParams{.UseThreshold = UseThreshold, .MinSize = MinSize, .MaxSize = MaxSize, .AvgSize = AvgSize};
}
-} // namespace
+ void WriteChunkParams(CbObjectWriter& Writer, const ChunkedParams& Params)
+ {
+ Writer.BeginObject("ChunkingParams"sv);
+ {
+ Writer.AddBool("UseThreshold"sv, Params.UseThreshold);
-class BasicChunkingController : public ChunkingController
-{
-public:
- BasicChunkingController(std::span<const std::string_view> ExcludeExtensions,
- bool ExcludeElfFiles,
- bool ExcludeMachOFiles,
- uint64_t ChunkFileSizeLimit,
- const ChunkedParams& ChunkingParams)
- : m_ChunkExcludeExtensions(ExcludeExtensions.begin(), ExcludeExtensions.end())
- , m_ExcludeElfFiles(ExcludeElfFiles)
- , m_ExcludeMachOFiles(ExcludeMachOFiles)
- , m_ChunkFileSizeLimit(ChunkFileSizeLimit)
- , m_ChunkingParams(ChunkingParams)
+ Writer.AddInteger("MinSize"sv, (uint64_t)Params.MinSize);
+ Writer.AddInteger("MaxSize"sv, (uint64_t)Params.MaxSize);
+ Writer.AddInteger("AvgSize"sv, (uint64_t)Params.AvgSize);
+ }
+ Writer.EndObject(); // ChunkingParams
+ }
+
+ bool IsElfFile(BasicFile& Buffer) // Returns true when the first four bytes of the file match the ELF magic number.
{
+ if (Buffer.FileSize() > 4) // Strictly greater than 4: a file of exactly four bytes is never probed.
+ {
+ uint32_t ElfCheck = 0;
+ Buffer.Read(&ElfCheck, 4, 0); // Reads 4 bytes into ElfCheck; the trailing 0 is presumably the file offset - confirm against BasicFile::Read.
+ if (ElfCheck == 0x464c457f) // Bytes 0x7f 'E' 'L' 'F' as seen through a little-endian uint32_t. NOTE(review): would not match on a big-endian host.
+ {
+ return true;
+ }
+ }
+ return false; // Too small to probe, or magic did not match.
}
- BasicChunkingController(CbObjectView Parameters)
- : m_ChunkExcludeExtensions(ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()))
- , m_ExcludeElfFiles(Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles))
- , m_ExcludeMachOFiles(Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles))
- , m_ChunkFileSizeLimit(Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit))
- , m_ChunkingParams(ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()))
+ bool IsMachOFile(BasicFile& Buffer) // Returns true when the first four bytes of the file match a 32-bit Mach-O magic number.
{
+ if (Buffer.FileSize() > 4) // Strictly greater than 4: a file of exactly four bytes is never probed.
+ {
+ uint32_t MachOCheck = 0;
+ Buffer.Read(&MachOCheck, 4, 0); // Reads 4 bytes into MachOCheck; the trailing 0 is presumably the file offset - confirm against BasicFile::Read.
+ if ((MachOCheck == 0xfeedface) || (MachOCheck == 0xcefaedfe)) // 32-bit Mach-O magic and its byte-swapped form. NOTE(review): the 64-bit magics (0xfeedfacf / 0xcffaedfe) are not matched - confirm this is intended.
+ {
+ return true;
+ }
+ }
+ return false; // Too small to probe, or magic did not match.
}
+} // namespace
+
+class BasicChunkingController : public ChunkingController
+{
+public:
+ BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings) {}
+
+ BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {}
virtual bool ProcessFile(const std::filesystem::path& InputPath,
uint64_t RawSize,
@@ -70,35 +93,25 @@ public:
{
ZEN_TRACE_CPU("BasicChunkingController::ProcessFile");
const bool ExcludeFromChunking =
- std::find(m_ChunkExcludeExtensions.begin(), m_ChunkExcludeExtensions.end(), InputPath.extension()) !=
- m_ChunkExcludeExtensions.end();
+ std::find(m_Settings.ExcludeExtensions.begin(), m_Settings.ExcludeExtensions.end(), InputPath.extension()) !=
+ m_Settings.ExcludeExtensions.end();
- if (ExcludeFromChunking || (RawSize < m_ChunkFileSizeLimit))
+ if (ExcludeFromChunking || (RawSize < m_Settings.ChunkFileSizeLimit))
{
return false;
}
BasicFile Buffer(InputPath, BasicFile::Mode::kRead);
- if (m_ExcludeElfFiles && Buffer.FileSize() > 4)
+ if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer))
{
- uint32_t ElfCheck = 0;
- Buffer.Read(&ElfCheck, 4, 0);
- if (ElfCheck == 0x464c457f)
- {
- return false;
- }
+ return false;
}
- if (m_ExcludeMachOFiles && Buffer.FileSize() > 4)
+ if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer))
{
- uint32_t MachOCheck = 0;
- Buffer.Read(&MachOCheck, 4, 0);
- if ((MachOCheck == 0xfeedface) || (MachOCheck == 0xcefaedfe))
- {
- return false;
- }
+ return false;
}
- OutChunked = ChunkData(Buffer, 0, RawSize, m_ChunkingParams, &BytesProcessed, &AbortFlag);
+ OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag);
return true;
}
@@ -109,59 +122,43 @@ public:
CbObjectWriter Writer;
Writer.BeginArray("ChunkExcludeExtensions"sv);
{
- for (const std::string& Extension : m_ChunkExcludeExtensions)
+ for (const std::string& Extension : m_Settings.ExcludeExtensions)
{
Writer.AddString(Extension);
}
}
Writer.EndArray(); // ChunkExcludeExtensions
- Writer.AddBool("ExcludeElfFiles"sv, m_ExcludeElfFiles);
- Writer.AddBool("ExcludeMachOFiles"sv, m_ExcludeMachOFiles);
+ Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles);
+ Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles);
+ Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit);
- Writer.AddInteger("ChunkFileSizeLimit"sv, m_ChunkFileSizeLimit);
- Writer.BeginObject("ChunkingParams"sv);
- {
- Writer.AddBool("UseThreshold"sv, m_ChunkingParams.UseThreshold);
+ WriteChunkParams(Writer, m_Settings.ChunkingParams);
- Writer.AddInteger("MinSize"sv, (uint64_t)m_ChunkingParams.MinSize);
- Writer.AddInteger("MaxSize"sv, (uint64_t)m_ChunkingParams.MaxSize);
- Writer.AddInteger("AvgSize"sv, (uint64_t)m_ChunkingParams.AvgSize);
- }
- Writer.EndObject(); // ChunkingParams
return Writer.Save();
}
static constexpr std::string_view Name = "BasicChunkingController"sv;
-protected:
- const std::vector<std::string> m_ChunkExcludeExtensions;
- const bool m_ExcludeElfFiles = false;
- const bool m_ExcludeMachOFiles = false;
- const uint64_t m_ChunkFileSizeLimit;
- const ChunkedParams m_ChunkingParams;
+private:
+ static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters) // Builds the controller settings from the fields of a serialized parameter object.
+ {
+ return BasicChunkingControllerSettings{ // Field names below match the ones this class writes when serializing its parameters.
+ .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()),
+ .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), // The argument is presumably the fallback used when the field is absent - confirm against CbFieldView.
+ .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles),
+ .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit),
+ .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; // Nested object, parsed by the shared ReadChunkParams helper.
+ }
+
+ const BasicChunkingControllerSettings m_Settings;
};
class ChunkingControllerWithFixedChunking : public ChunkingController
{
public:
- ChunkingControllerWithFixedChunking(std::span<const std::string_view> FixedChunkingExtensions,
- uint64_t ChunkFileSizeLimit,
- const ChunkedParams& ChunkingParams,
- uint32_t FixedChunkingChunkSize)
- : m_FixedChunkingExtensions(FixedChunkingExtensions.begin(), FixedChunkingExtensions.end())
- , m_ChunkFileSizeLimit(ChunkFileSizeLimit)
- , m_ChunkingParams(ChunkingParams)
- , m_FixedChunkingChunkSize(FixedChunkingChunkSize)
- {
- }
+ ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings) {}
- ChunkingControllerWithFixedChunking(CbObjectView Parameters)
- : m_FixedChunkingExtensions(ReadStringArray(Parameters["FixedChunkingExtensions"sv].AsArrayView()))
- , m_ChunkFileSizeLimit(Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit))
- , m_ChunkingParams(ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()))
- , m_FixedChunkingChunkSize(Parameters["FixedChunkingChunkSize"sv].AsUInt32(16u * 1024u * 1024u))
- {
- }
+ ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {}
virtual bool ProcessFile(const std::filesystem::path& InputPath,
uint64_t RawSize,
@@ -170,33 +167,71 @@ public:
std::atomic<bool>& AbortFlag) const override
{
ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile");
- if (RawSize < m_ChunkFileSizeLimit)
+ const bool ExcludeFromChunking =
+ std::find(m_Settings.ExcludeExtensions.begin(), m_Settings.ExcludeExtensions.end(), InputPath.extension()) !=
+ m_Settings.ExcludeExtensions.end();
+
+ if (ExcludeFromChunking || (RawSize < m_Settings.ChunkFileSizeLimit))
{
return false;
}
- const bool FixedChunking = std::find(m_FixedChunkingExtensions.begin(), m_FixedChunkingExtensions.end(), InputPath.extension()) !=
- m_FixedChunkingExtensions.end();
- if (FixedChunking)
+ const bool FixedChunkingExtension =
+ std::find(m_Settings.FixedChunkingExtensions.begin(), m_Settings.FixedChunkingExtensions.end(), InputPath.extension()) !=
+ m_Settings.FixedChunkingExtensions.end();
+
+ if (FixedChunkingExtension)
{
+ if (RawSize < m_Settings.MinSizeForFixedChunking)
+ {
+ return false;
+ }
ZEN_TRACE_CPU("FixedChunking");
- IoHashStream FullHash;
- IoBuffer Source = IoBufferBuilder::MakeFromFile(InputPath);
+ IoHashStream FullHasher;
+ BasicFile Source(InputPath, BasicFile::Mode::kRead);
uint64_t Offset = 0;
tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex;
- ChunkHashToChunkIndex.reserve(1 + (RawSize / m_FixedChunkingChunkSize));
+ const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize);
+ ChunkHashToChunkIndex.reserve(ExpectedChunkCount);
+ OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount);
+ OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount);
+ OutChunked.ChunkSources.reserve(ExpectedChunkCount);
+
+ static const uint64_t BufferingSize = 256u * 1024u;
+
+ IoHashStream ChunkHasher;
+
while (Offset < RawSize)
{
if (AbortFlag)
{
return false;
}
- uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_FixedChunkingChunkSize);
- IoBuffer Chunk(Source, Offset, ChunkSize);
- MemoryView ChunkData = Chunk.GetView();
- FullHash.Append(ChunkData);
- IoHash ChunkHash = IoHash::HashBuffer(ChunkData);
+ ChunkHasher.Reset();
+
+ uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize);
+ if (ChunkSize >= (BufferingSize + BufferingSize / 2))
+ {
+ ScanFile(Source.Handle(),
+ Offset,
+ ChunkSize,
+ BufferingSize,
+ [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) {
+ FullHasher.Append(Data, Size);
+ ChunkHasher.Append(Data, Size);
+ BytesProcessed.fetch_add(Size);
+ });
+ }
+ else
+ {
+ IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize);
+ FullHasher.Append(ChunkData);
+ ChunkHasher.Append(ChunkData);
+ BytesProcessed.fetch_add(ChunkSize);
+ }
+
+ const IoHash ChunkHash = ChunkHasher.GetHash();
if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end())
{
OutChunked.Info.ChunkSequence.push_back(It->second);
@@ -209,16 +244,24 @@ public:
OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)});
}
Offset += ChunkSize;
- BytesProcessed.fetch_add(ChunkSize);
}
OutChunked.Info.RawSize = RawSize;
- OutChunked.Info.RawHash = FullHash.GetHash();
+ OutChunked.Info.RawHash = FullHasher.GetHash();
return true;
}
else
{
BasicFile Buffer(InputPath, BasicFile::Mode::kRead);
- OutChunked = ChunkData(Buffer, 0, RawSize, m_ChunkingParams, &BytesProcessed);
+ if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer))
+ {
+ return false;
+ }
+ if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer))
+ {
+ return false;
+ }
+
+ OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag);
return true;
}
}
@@ -230,47 +273,57 @@ public:
CbObjectWriter Writer;
Writer.BeginArray("FixedChunkingExtensions");
{
- for (const std::string& Extension : m_FixedChunkingExtensions)
+ for (const std::string& Extension : m_Settings.FixedChunkingExtensions)
{
Writer.AddString(Extension);
}
}
Writer.EndArray(); // FixedChunkingExtensions
- Writer.AddInteger("ChunkFileSizeLimit"sv, m_ChunkFileSizeLimit);
- Writer.BeginObject("ChunkingParams"sv);
- {
- Writer.AddBool("UseThreshold"sv, m_ChunkingParams.UseThreshold);
- Writer.AddInteger("MinSize"sv, (uint64_t)m_ChunkingParams.MinSize);
- Writer.AddInteger("MaxSize"sv, (uint64_t)m_ChunkingParams.MaxSize);
- Writer.AddInteger("AvgSize"sv, (uint64_t)m_ChunkingParams.AvgSize);
+ Writer.BeginArray("ChunkExcludeExtensions"sv);
+ {
+ for (const std::string& Extension : m_Settings.ExcludeExtensions)
+ {
+ Writer.AddString(Extension);
+ }
}
- Writer.EndObject(); // ChunkingParams
- Writer.AddInteger("FixedChunkingChunkSize"sv, m_FixedChunkingChunkSize);
+ Writer.EndArray(); // ChunkExcludeExtensions
+
+ Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles);
+ Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles);
+
+ Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit);
+
+ WriteChunkParams(Writer, m_Settings.ChunkingParams);
+
+ Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize);
+ Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking);
return Writer.Save();
}
static constexpr std::string_view Name = "ChunkingControllerWithFixedChunking"sv;
-protected:
- const std::vector<std::string> m_FixedChunkingExtensions;
- const uint64_t m_ChunkFileSizeLimit;
- const ChunkedParams m_ChunkingParams;
- const uint32_t m_FixedChunkingChunkSize;
+private:
+ static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters) // Builds the controller settings from the fields of a serialized parameter object.
+ {
+ return ChunkingControllerWithFixedChunkingSettings{ // Field names below match the ones this class writes when serializing its parameters.
+ .FixedChunkingExtensions = ReadStringArray(Parameters["FixedChunkingExtensions"sv].AsArrayView()),
+ .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()),
+ .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), // The argument is presumably the fallback used when the field is absent - confirm against CbFieldView.
+ .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles),
+ .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit),
+ .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()), // Nested object, parsed by the shared ReadChunkParams helper.
+ .FixedChunkingChunkSize = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize), // NOTE(review): read as 64-bit, while ProcessFile narrows each chunk size to uint32_t - values above 4 GiB look unsupported; confirm.
+ .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultFixedChunkingChunkSize)}; // NOTE(review): falls back to the same constant as FixedChunkingChunkSize - confirm a dedicated default is not intended.
+ }
+
+ const ChunkingControllerWithFixedChunkingSettings m_Settings;
};
std::unique_ptr<ChunkingController>
-CreateBasicChunkingController(std::span<const std::string_view> ExcludeExtensions,
- bool ExcludeElfFiles,
- bool ExcludeMachOFiles,
- uint64_t ChunkFileSizeLimit,
- const ChunkedParams& ChunkingParams)
+CreateBasicChunkingController(const BasicChunkingControllerSettings& Settings)
{
- return std::make_unique<BasicChunkingController>(ExcludeExtensions,
- ExcludeElfFiles,
- ExcludeMachOFiles,
- ChunkFileSizeLimit,
- ChunkingParams);
+ return std::make_unique<BasicChunkingController>(Settings);
}
std::unique_ptr<ChunkingController>
CreateBasicChunkingController(CbObjectView Parameters)
@@ -279,15 +332,9 @@ CreateBasicChunkingController(CbObjectView Parameters)
}
std::unique_ptr<ChunkingController>
-CreateChunkingControllerWithFixedChunking(std::span<const std::string_view> FixedChunkingExtensions,
- uint64_t ChunkFileSizeLimit,
- const ChunkedParams& ChunkingParams,
- uint32_t FixedChunkingChunkSize)
+CreateChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Setting)
{
- return std::make_unique<ChunkingControllerWithFixedChunking>(FixedChunkingExtensions,
- ChunkFileSizeLimit,
- ChunkingParams,
- FixedChunkingChunkSize);
+ return std::make_unique<ChunkingControllerWithFixedChunking>(Setting);
}
std::unique_ptr<ChunkingController>
CreateChunkingControllerWithFixedChunking(CbObjectView Parameters)