diff options
| author | Dan Engelbrecht <[email protected]> | 2025-10-20 12:09:46 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-10-20 12:09:46 +0200 |
| commit | c1af02eeb2badfbd2c01125730c6b85bbed8be9e (patch) | |
| tree | d5a21612f886940166f905b6abc408959220834d /src | |
| parent | 5.7.7-pre0 (diff) | |
| download | zen-c1af02eeb2badfbd2c01125730c6b85bbed8be9e.tar.xz zen-c1af02eeb2badfbd2c01125730c6b85bbed8be9e.zip | |
updated chunking strategy (#589)
- Improvement: `zen builds` now splits large compress-only files into 64 MB chunks to avoid very large files in Cloud Storage
- Improvement: `zen builds` now treats `.msixvc` files as non-compressable
Moved and cleaned up compactbinary_helpers functions
Tweaked fixed chunking implementation for better performance
Refactored so we have one list of "non-compressable" extensions
Implemented new `StandardChunkingStrategy` and moved the two existing strategies into a hidden legacy namespace
Added `FilteredDownloadedBytesPerSecond.Start();` call that got lost during previous refactoring
Diffstat (limited to 'src')
9 files changed, 666 insertions, 421 deletions
diff --git a/src/zen/cmds/builds_cmd.cpp b/src/zen/cmds/builds_cmd.cpp index 27f050a44..9c6fd17ab 100644 --- a/src/zen/cmds/builds_cmd.cpp +++ b/src/zen/cmds/builds_cmd.cpp @@ -550,7 +550,8 @@ namespace { .TempDir = TempDir, .ExcludeFolders = DefaultExcludeFolders, .ExcludeExtensions = DefaultExcludeExtensions, - .ZenExcludeManifestName = ZenExcludeManifestName}); + .ZenExcludeManifestName = ZenExcludeManifestName, + .NonCompressableExtensions = DefaultSplitOnlyExtensions}); UploadOp.Execute(); if (AbortFlag) { @@ -1282,6 +1283,11 @@ namespace { // TODO: GetBlockDescriptions for all BlockRawHashes in one go - check for local block descriptions when we cache them { + if (!IsQuiet) + { + ZEN_CONSOLE("Fetching metadata for {} blocks", BlockRawHashes.size()); + } + Stopwatch GetBlockMetadataTimer; std::vector<ChunkBlockDescription> UnorderedList; @@ -2045,7 +2051,7 @@ namespace { if (!ChunkController && !IsQuiet) { ZEN_CONSOLE_WARN("Unspecified chunking algorith, using default"); - ChunkController = CreateChunkingControllerWithFixedChunking(ChunkingControllerWithFixedChunkingSettings{}); + ChunkController = CreateStandardChunkingController(StandardChunkingControllerSettings{}); } LocalContent = GetLocalContent(LocalFolderScanStats, @@ -2348,14 +2354,17 @@ namespace { ChunkedFolderContent CompareFolderContent; { - std::unique_ptr<ChunkingController> ChunkController = - CreateChunkingControllerWithFixedChunking(ChunkingControllerWithFixedChunkingSettings{}); - std::vector<std::string> ExcludeExtensions = DefaultExcludeExtensions; + StandardChunkingControllerSettings ChunkingSettings; + std::unique_ptr<ChunkingController> ChunkController = CreateStandardChunkingController(ChunkingSettings); + std::vector<std::string> ExcludeExtensions = DefaultExcludeExtensions; if (OnlyChunked) { ExcludeExtensions.insert(ExcludeExtensions.end(), - DefaultChunkingExcludeExtensions.begin(), - DefaultChunkingExcludeExtensions.end()); + ChunkingSettings.SplitOnlyExtensions.begin(), + 
ChunkingSettings.SplitOnlyExtensions.end()); + ExcludeExtensions.insert(ExcludeExtensions.end(), + ChunkingSettings.SplitAndCompressExtensions.begin(), + ChunkingSettings.SplitAndCompressExtensions.end()); } auto IsAcceptedFolder = [ExcludeFolders = DefaultExcludeFolders](const std::string_view& RelativePath) -> bool { diff --git a/src/zencore/include/zencore/compactbinaryutil.h b/src/zencore/include/zencore/compactbinaryutil.h index d750c6492..eecc3344b 100644 --- a/src/zencore/include/zencore/compactbinaryutil.h +++ b/src/zencore/include/zencore/compactbinaryutil.h @@ -52,4 +52,136 @@ ValidateAndReadCompactBinaryObject(const IoBuffer&& Payload, CbValidateError& Ou } CbObject ValidateAndReadCompactBinaryObject(const CompressedBuffer&& Payload, CbValidateError& OutError); +namespace compactbinary_helpers { + template<typename Type> + inline void WriteArray(std::span<const Type> Values, std::string_view ArrayName, CbWriter& Output) + { + Output.BeginArray(ArrayName); + for (const Type Value : Values) + { + Output << Value; + } + Output.EndArray(); + } + + template<typename Type> + inline void WriteArray(const std::vector<Type>& Values, std::string_view ArrayName, CbWriter& Output) + { + WriteArray(std::span<const Type>(Values), ArrayName, Output); + } + + template<> + inline void WriteArray(std::span<const std::filesystem::path> Values, std::string_view ArrayName, CbWriter& Output) + { + Output.BeginArray(ArrayName); + for (const std::filesystem::path& Path : Values) + { + Output.AddString((const char*)Path.generic_u8string().c_str()); + } + Output.EndArray(); + } + + template<> + inline void WriteArray(const std::vector<std::filesystem::path>& Values, std::string_view ArrayName, CbWriter& Output) + { + WriteArray(std::span<const std::filesystem::path>(Values), ArrayName, Output); + } + + inline void WriteBinaryAttachmentArray(std::span<const IoHash> Values, std::string_view ArrayName, CbWriter& Output) + { + Output.BeginArray(ArrayName); + for (const IoHash& Hash : 
Values) + { + Output.AddBinaryAttachment(Hash); + } + Output.EndArray(); + } + + inline void WriteBinaryAttachmentArray(const std::vector<IoHash>& Values, std::string_view ArrayName, CbWriter& Output) + { + WriteArray(std::span<const IoHash>(Values), ArrayName, Output); + } + + template<typename Type> + std::vector<Type> ReadArray(std::string_view ArrayName, CbObjectView Input); + + template<> + inline std::vector<uint32_t> ReadArray(std::string_view ArrayName, CbObjectView Input) + { + std::vector<uint32_t> Result; + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsUInt32()); + } + return Result; + } + + template<> + inline std::vector<uint64_t> ReadArray(std::string_view ArrayName, CbObjectView Input) + { + std::vector<uint64_t> Result; + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsUInt64()); + } + return Result; + } + + template<> + inline std::vector<std::filesystem::path> ReadArray(std::string_view ArrayName, CbObjectView Input) + { + std::vector<std::filesystem::path> Result; + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(std::filesystem::path(ItemView.AsU8String())); + } + return Result; + } + + template<> + inline std::vector<std::string> ReadArray(std::string_view ArrayName, CbObjectView Input) + { + std::vector<std::string> Result; + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(std::string(ItemView.AsString())); + } + return Result; + } + + template<> + inline std::vector<IoHash> ReadArray(std::string_view ArrayName, CbObjectView Input) + { + std::vector<IoHash> Result; + CbArrayView Array = Input[ArrayName].AsArrayView(); + 
Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsHash()); + } + return Result; + } + + inline std::vector<IoHash> ReadBinaryAttachmentArray(std::string_view ArrayName, CbObjectView Input) + { + std::vector<IoHash> Result; + CbArrayView Array = Input[ArrayName].AsArrayView(); + Result.reserve(Array.Num()); + for (CbFieldView ItemView : Array) + { + Result.push_back(ItemView.AsBinaryAttachment()); + } + return Result; + } + +} // namespace compactbinary_helpers + } // namespace zen diff --git a/src/zenremotestore/builds/buildsavedstate.cpp b/src/zenremotestore/builds/buildsavedstate.cpp index 933616856..cf46668f9 100644 --- a/src/zenremotestore/builds/buildsavedstate.cpp +++ b/src/zenremotestore/builds/buildsavedstate.cpp @@ -89,9 +89,9 @@ ReadBuildContentFromCompactBinary(CbObjectView BuildPartManifest, CbObjectView FilesObject = BuildPartManifest["files"sv].AsObjectView(); - compactbinary_helpers::ReadArray("paths"sv, FilesObject, OutPaths); - compactbinary_helpers::ReadArray("rawhashes"sv, FilesObject, OutRawHashes); - compactbinary_helpers::ReadArray("rawsizes"sv, FilesObject, OutRawSizes); + OutPaths = compactbinary_helpers::ReadArray<std::filesystem::path>("paths"sv, FilesObject); + OutRawHashes = compactbinary_helpers::ReadArray<IoHash>("rawhashes"sv, FilesObject); + OutRawSizes = compactbinary_helpers::ReadArray<uint64_t>("rawsizes"sv, FilesObject); uint64_t PathCount = OutPaths.size(); if (OutRawHashes.size() != PathCount) @@ -103,15 +103,13 @@ ReadBuildContentFromCompactBinary(CbObjectView BuildPartManifest, throw std::runtime_error(fmt::format("Number of raw sizes entries does not match number of paths")); } - std::vector<uint32_t> ModeArray; - compactbinary_helpers::ReadArray("mode"sv, FilesObject, ModeArray); + std::vector<uint32_t> ModeArray = compactbinary_helpers::ReadArray<uint32_t>("mode"sv, FilesObject); if (ModeArray.size() != PathCount && ModeArray.size() != 0) { throw 
std::runtime_error(fmt::format("Number of attribute entries does not match number of paths")); } - std::vector<uint32_t> AttributeArray; - compactbinary_helpers::ReadArray("attributes"sv, FilesObject, ModeArray); + std::vector<uint32_t> AttributeArray = compactbinary_helpers::ReadArray<uint32_t>("attributes"sv, FilesObject); if (AttributeArray.size() != PathCount && AttributeArray.size() != 0) { throw std::runtime_error(fmt::format("Number of attribute entries does not match number of paths")); @@ -143,26 +141,24 @@ ReadBuildContentFromCompactBinary(CbObjectView BuildPartManifest, if (CbObjectView ChunkContentView = BuildPartManifest["chunkedContent"sv].AsObjectView(); ChunkContentView) { - compactbinary_helpers::ReadArray("sequenceRawHashes"sv, ChunkContentView, OutSequenceRawHashes); - compactbinary_helpers::ReadArray("chunkcounts"sv, ChunkContentView, OutChunkCounts); + OutSequenceRawHashes = compactbinary_helpers::ReadArray<IoHash>("sequenceRawHashes"sv, ChunkContentView); + OutChunkCounts = compactbinary_helpers::ReadArray<uint32_t>("chunkcounts"sv, ChunkContentView); if (OutChunkCounts.size() != OutSequenceRawHashes.size()) { throw std::runtime_error(fmt::format("Number of chunk count entries does not match number of paths")); } - compactbinary_helpers::ReadArray("chunkorders"sv, ChunkContentView, OutAbsoluteChunkOrders); + OutAbsoluteChunkOrders = compactbinary_helpers::ReadArray<uint32_t>("chunkorders"sv, ChunkContentView); } else if (FilesObject["chunkcounts"sv]) { // Legacy zen style - std::vector<uint32_t> LegacyChunkCounts; - compactbinary_helpers::ReadArray("chunkcounts"sv, FilesObject, LegacyChunkCounts); + std::vector<uint32_t> LegacyChunkCounts = compactbinary_helpers::ReadArray<uint32_t>("chunkcounts"sv, FilesObject); if (LegacyChunkCounts.size() != PathCount) { throw std::runtime_error(fmt::format("Number of chunk count entries does not match number of paths")); } - std::vector<uint32_t> LegacyAbsoluteChunkOrders; - 
compactbinary_helpers::ReadArray("chunkorders"sv, FilesObject, LegacyAbsoluteChunkOrders); + std::vector<uint32_t> LegacyAbsoluteChunkOrders = compactbinary_helpers::ReadArray<uint32_t>("chunkorders"sv, FilesObject); CbArrayView ChunkOrdersArray = BuildPartManifest["chunkorders"sv].AsArrayView(); const uint64_t ChunkOrdersCount = ChunkOrdersArray.Num(); @@ -217,8 +213,8 @@ ReadBuildContentFromCompactBinary(CbObjectView BuildPartManifest, CbObjectView ChunkAttachmentsView = BuildPartManifest["chunkAttachments"sv].AsObjectView(); { - compactbinary_helpers::ReadBinaryAttachmentArray("rawHashes"sv, ChunkAttachmentsView, OutLooseChunkHashes); - compactbinary_helpers::ReadArray("chunkRawSizes"sv, ChunkAttachmentsView, OutLooseChunkRawSizes); + OutLooseChunkHashes = compactbinary_helpers::ReadBinaryAttachmentArray("rawHashes"sv, ChunkAttachmentsView); + OutLooseChunkRawSizes = compactbinary_helpers::ReadArray<uint64_t>("chunkRawSizes"sv, ChunkAttachmentsView); if (OutLooseChunkHashes.size() != OutLooseChunkRawSizes.size()) { throw std::runtime_error(fmt::format("Number of attachment chunk hashes does not match number of attachemnt chunk raw sizes")); @@ -227,7 +223,7 @@ ReadBuildContentFromCompactBinary(CbObjectView BuildPartManifest, CbObjectView BlocksView = BuildPartManifest["blockAttachments"sv].AsObjectView(); { - compactbinary_helpers::ReadBinaryAttachmentArray("rawHashes"sv, BlocksView, OutBlockRawHashes); + OutBlockRawHashes = compactbinary_helpers::ReadBinaryAttachmentArray("rawHashes"sv, BlocksView); } } diff --git a/src/zenremotestore/builds/buildstorageoperations.cpp b/src/zenremotestore/builds/buildstorageoperations.cpp index 39f7e8edf..ebb876ed9 100644 --- a/src/zenremotestore/builds/buildstorageoperations.cpp +++ b/src/zenremotestore/builds/buildstorageoperations.cpp @@ -11,6 +11,7 @@ #include <zencore/basicfile.h> #include <zencore/compactbinary.h> #include <zencore/compactbinaryfile.h> +#include <zencore/compactbinaryutil.h> #include 
<zencore/filesystem.h> #include <zencore/fmtutils.h> #include <zencore/parallelwork.h> @@ -143,24 +144,15 @@ namespace { return Result.FailedRemovePaths.empty(); } - const std::vector<uint32_t> NonCompressableExtensions({HashStringDjb2(".mp4"sv), - HashStringDjb2(".zip"sv), - HashStringDjb2(".7z"sv), - HashStringDjb2(".bzip"sv), - HashStringDjb2(".rar"sv), - HashStringDjb2(".gzip"sv), - HashStringDjb2(".apk"sv), - HashStringDjb2(".nsp"sv), - HashStringDjb2(".xvc"sv), - HashStringDjb2(".pkg"sv), - HashStringDjb2(".dmg"sv), - HashStringDjb2(".ipa"sv)}); - - const tsl::robin_set<uint32_t> NonCompressableExtensionSet(NonCompressableExtensions.begin(), NonCompressableExtensions.end()); - - bool IsExtensionHashCompressable(const uint32_t PathHash) { return !NonCompressableExtensionSet.contains(PathHash); } + bool IsExtensionHashCompressable(const tsl::robin_set<uint32_t>& NonCompressableExtensionHashes, const uint32_t PathHash) + { + return !NonCompressableExtensionHashes.contains(PathHash); + } - bool IsChunkCompressable(const ChunkedFolderContent& Content, const ChunkedContentLookup& Lookup, uint32_t ChunkIndex) + bool IsChunkCompressable(const tsl::robin_set<uint32_t>& NonCompressableExtensionHashes, + const ChunkedFolderContent& Content, + const ChunkedContentLookup& Lookup, + uint32_t ChunkIndex) { ZEN_UNUSED(Content); const uint32_t ChunkLocationCount = Lookup.ChunkSequenceLocationCounts[ChunkIndex]; @@ -173,7 +165,7 @@ namespace { const uint32_t PathIndex = Lookup.SequenceIndexFirstPathIndex[SequenceIndex]; const uint32_t ExtensionHash = Lookup.PathExtensionHash[PathIndex]; - const bool IsCompressable = IsExtensionHashCompressable(ExtensionHash); + const bool IsCompressable = IsExtensionHashCompressable(NonCompressableExtensionHashes, ExtensionHash); return IsCompressable; } @@ -1701,6 +1693,8 @@ BuildsOperationUpdateFolder::Execute(FolderContent& OutLocalFolderState) const BlockRangeDescriptor& BlockRange = BlockRangeWorks[BlockRangeIndex]; + 
FilteredDownloadedBytesPerSecond.Start(); + DownloadPartialBlock( BlockRange, ExistsResult, @@ -4603,6 +4597,11 @@ BuildsOperationUploadFolder::BuildsOperationUploadFolder(BuildOpLogOutput& L , m_MetaData(MetaData) , m_Options(Options) { + m_NonCompressableExtensionHashes.reserve(Options.NonCompressableExtensions.size()); + for (const std::string& Extension : Options.NonCompressableExtensions) + { + m_NonCompressableExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + } } void @@ -4783,8 +4782,7 @@ BuildsOperationUploadFolder::Execute() m_LocalFolderScanStats.ElapsedWallTimeUS = ManifestParseTimer.GetElapsedTimeUs(); } - std::unique_ptr<ChunkingController> ChunkController = - CreateChunkingControllerWithFixedChunking(ChunkingControllerWithFixedChunkingSettings{}); + std::unique_ptr<ChunkingController> ChunkController = CreateStandardChunkingController(StandardChunkingControllerSettings{}); { CbObjectWriter ChunkParametersWriter; ChunkParametersWriter.AddString("name"sv, ChunkController->GetName()); @@ -6145,7 +6143,7 @@ BuildsOperationUploadFolder::GenerateBlock(const ChunkedFolderContent& Content, uint64_t RawSize = Chunk.GetSize(); const bool ShouldCompressChunk = Lookup.RawHashToSequenceIndex.contains(ChunkHash) && (RawSize >= m_Options.MinimumSizeForCompressInBlock) && - IsChunkCompressable(Content, Lookup, ChunkIndex); + IsChunkCompressable(m_NonCompressableExtensionHashes, Content, Lookup, ChunkIndex); const OodleCompressionLevel CompressionLevel = ShouldCompressChunk ? 
OodleCompressionLevel::VeryFast : OodleCompressionLevel::None; return {RawSize, CompressedBuffer::Compress(Chunk, OodleCompressor::Mermaid, CompressionLevel)}; @@ -6187,7 +6185,7 @@ BuildsOperationUploadFolder::RebuildBlock(const ChunkedFolderContent& Content, const uint64_t RawSize = Chunk.GetSize(); const bool ShouldCompressChunk = Lookup.RawHashToSequenceIndex.contains(ChunkHash) && (RawSize >= m_Options.MinimumSizeForCompressInBlock) && - IsChunkCompressable(Content, Lookup, ChunkIndex); + IsChunkCompressable(m_NonCompressableExtensionHashes, Content, Lookup, ChunkIndex); const OodleCompressionLevel CompressionLevel = ShouldCompressChunk ? OodleCompressionLevel::VeryFast : OodleCompressionLevel::None; CompositeBuffer CompressedChunk = @@ -6682,7 +6680,7 @@ BuildsOperationUploadFolder::CompressChunk(const ChunkedFolderContent& Content, throw std::runtime_error(fmt::format("Fetched chunk {} has invalid size", ChunkHash)); } - const bool ShouldCompressChunk = IsChunkCompressable(Content, Lookup, ChunkIndex); + const bool ShouldCompressChunk = IsChunkCompressable(m_NonCompressableExtensionHashes, Content, Lookup, ChunkIndex); const OodleCompressionLevel CompressionLevel = ShouldCompressChunk ? 
OodleCompressionLevel::VeryFast : OodleCompressionLevel::None; if (ShouldCompressChunk) diff --git a/src/zenremotestore/chunking/chunkedcontent.cpp b/src/zenremotestore/chunking/chunkedcontent.cpp index eb0e8bdc9..ea67e3d94 100644 --- a/src/zenremotestore/chunking/chunkedcontent.cpp +++ b/src/zenremotestore/chunking/chunkedcontent.cpp @@ -2,6 +2,7 @@ #include <zenremotestore/chunking/chunkedcontent.h> +#include <zencore/compactbinaryutil.h> #include <zencore/filesystem.h> #include <zencore/fmtutils.h> #include <zencore/logging.h> @@ -362,11 +363,11 @@ LoadFolderContentToCompactBinary(CbObjectView Input) { ZEN_TRACE_CPU("LoadFolderContentToCompactBinary"); FolderContent Content; - Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); - compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); - compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); - compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); - compactbinary_helpers::ReadArray("modificationTimes"sv, Input, Content.ModificationTicks); + Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); + Content.Paths = compactbinary_helpers::ReadArray<std::filesystem::path>("paths"sv, Input); + Content.RawSizes = compactbinary_helpers::ReadArray<uint64_t>("rawSizes"sv, Input); + Content.Attributes = compactbinary_helpers::ReadArray<uint32_t>("attributes"sv, Input); + Content.ModificationTicks = compactbinary_helpers::ReadArray<uint64_t>("modificationTimes"sv, Input); return Content; } @@ -534,18 +535,18 @@ LoadChunkedFolderContentToCompactBinary(CbObjectView Input) { ZEN_TRACE_CPU("LoadChunkedFolderContentToCompactBinary"); ChunkedFolderContent Content; - Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); - compactbinary_helpers::ReadArray("paths"sv, Input, Content.Paths); - compactbinary_helpers::ReadArray("rawSizes"sv, Input, Content.RawSizes); - 
compactbinary_helpers::ReadArray("attributes"sv, Input, Content.Attributes); - compactbinary_helpers::ReadArray("rawHashes"sv, Input, Content.RawHashes); - - CbObjectView ChunkedContentView = Input["chunkedContent"sv].AsObjectView(); - compactbinary_helpers::ReadArray("sequenceRawHashes"sv, ChunkedContentView, Content.ChunkedContent.SequenceRawHashes); - compactbinary_helpers::ReadArray("chunkCounts"sv, ChunkedContentView, Content.ChunkedContent.ChunkCounts); - compactbinary_helpers::ReadArray("chunkOrders"sv, ChunkedContentView, Content.ChunkedContent.ChunkOrders); - compactbinary_helpers::ReadArray("chunkHashes"sv, ChunkedContentView, Content.ChunkedContent.ChunkHashes); - compactbinary_helpers::ReadArray("chunkRawSizes"sv, ChunkedContentView, Content.ChunkedContent.ChunkRawSizes); + Content.Platform = FromString(Input["platform"sv].AsString(), GetSourceCurrentPlatform()); + Content.Paths = compactbinary_helpers::ReadArray<std::filesystem::path>("paths"sv, Input); + Content.RawSizes = compactbinary_helpers::ReadArray<uint64_t>("rawSizes"sv, Input); + Content.Attributes = compactbinary_helpers::ReadArray<uint32_t>("attributes"sv, Input); + Content.RawHashes = compactbinary_helpers::ReadArray<IoHash>("rawHashes"sv, Input); + + CbObjectView ChunkedContentView = Input["chunkedContent"sv].AsObjectView(); + Content.ChunkedContent.SequenceRawHashes = compactbinary_helpers::ReadArray<IoHash>("sequenceRawHashes"sv, ChunkedContentView); + Content.ChunkedContent.ChunkCounts = compactbinary_helpers::ReadArray<uint32_t>("chunkCounts"sv, ChunkedContentView); + Content.ChunkedContent.ChunkOrders = compactbinary_helpers::ReadArray<uint32_t>("chunkOrders"sv, ChunkedContentView); + Content.ChunkedContent.ChunkHashes = compactbinary_helpers::ReadArray<IoHash>("chunkHashes"sv, ChunkedContentView); + Content.ChunkedContent.ChunkRawSizes = compactbinary_helpers::ReadArray<uint64_t>("chunkRawSizes"sv, ChunkedContentView); return Content; } diff --git 
a/src/zenremotestore/chunking/chunkingcontroller.cpp b/src/zenremotestore/chunking/chunkingcontroller.cpp index cc20446ea..91ca18d10 100644 --- a/src/zenremotestore/chunking/chunkingcontroller.cpp +++ b/src/zenremotestore/chunking/chunkingcontroller.cpp @@ -3,7 +3,7 @@ #include <zenremotestore/chunking/chunkingcontroller.h> #include <zencore/basicfile.h> -#include <zencore/compactbinarybuilder.h> +#include <zencore/compactbinaryutil.h> #include <zencore/filesystem.h> #include <zencore/trace.h> @@ -16,23 +16,12 @@ namespace zen { using namespace std::literals; namespace { - std::vector<std::string> ReadStringArray(CbArrayView StringArray) - { - std::vector<std::string> Result; - Result.reserve(StringArray.Num()); - for (CbFieldView FieldView : StringArray) - { - Result.emplace_back(FieldView.AsString()); - } - return Result; - } - ChunkedParams ReadChunkParams(CbObjectView Params) { bool UseThreshold = Params["UseThreshold"sv].AsBool(true); - size_t MinSize = Params["MinSize"sv].AsUInt64(DefaultChunkedParams.MinSize); - size_t MaxSize = Params["MaxSize"sv].AsUInt64(DefaultChunkedParams.MaxSize); - size_t AvgSize = Params["AvgSize"sv].AsUInt64(DefaultChunkedParams.AvgSize); + size_t MinSize = Params["MinSize"sv].AsUInt64(DefaultDynamicChunkingParams.MinSize); + size_t MaxSize = Params["MaxSize"sv].AsUInt64(DefaultDynamicChunkingParams.MaxSize); + size_t AvgSize = Params["AvgSize"sv].AsUInt64(DefaultDynamicChunkingParams.AvgSize); return ChunkedParams{.UseThreshold = UseThreshold, .MinSize = MinSize, .MaxSize = MaxSize, .AvgSize = AvgSize}; } @@ -50,6 +39,44 @@ namespace { Writer.EndObject(); // ChunkingParams } +} // namespace + +namespace legacy { + const std::vector<std::string> DefaultChunkingExcludeExtensions = { + ".exe", ".dll", ".pdb", ".self", ".mp4", ".zip", ".7z", ".bzip", ".rar", ".gzip", + ".sym", ".psym", ".txt", ".ini", ".json", ".verse", ".versemodule", ".jpg", ".c", ".h", + ".cpp", ".cxx", ".c++", ".cc", ".hpp", ".hxx", ".h++", ".py", ".ogg", 
".plist"}; + + const std::vector<std::string> DefaultFixedChunkingExtensions = {".apk", ".nsp", ".xvc", ".pkg", ".dmg", ".ipa"}; + const bool DefaultChunkingExcludeElfFiles = true; + const bool DefaultChunkingExcludeMachOFiles = true; + + const size_t DefaultChunkingFileSizeLimit = DefaultDynamicChunkingParams.MaxSize; + + const uint64_t DefaultFixedChunkingChunkSize = 32u * 1024u * 1024u; + const uint64_t DefaultMinSizeForFixedChunking = DefaultFixedChunkingChunkSize * 8u; + + struct BasicChunkingControllerSettings + { + std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; + bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; + bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; + uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; + ChunkedParams ChunkingParams = DefaultDynamicChunkingParams; + }; + + struct ChunkingControllerWithFixedChunkingSettings + { + std::vector<std::string> FixedChunkingExtensions = DefaultFixedChunkingExtensions; + std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; + bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; + bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; + uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; + ChunkedParams ChunkingParams = DefaultDynamicChunkingParams; + uint64_t FixedChunkingChunkSize = DefaultFixedChunkingChunkSize; + uint64_t MinSizeForFixedChunking = DefaultMinSizeForFixedChunking; + }; + bool IsElfFile(BasicFile& Buffer) { if (Buffer.FileSize() > 4) @@ -77,114 +104,363 @@ namespace { } return false; } -} // namespace -class BasicChunkingController : public ChunkingController -{ -public: - BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings) + //////////// BasicChunkingController + + class BasicChunkingController : public ChunkingController { - m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); - for (const std::string& Extension : 
Settings.ExcludeExtensions) + public: + BasicChunkingController(const BasicChunkingControllerSettings& Settings) : m_Settings(Settings) { - m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); + for (const std::string& Extension : Settings.ExcludeExtensions) + { + m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + } } - } - BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + BasicChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} - virtual bool ProcessFile(const std::filesystem::path& InputPath, - uint64_t RawSize, - ChunkedInfoWithSource& OutChunked, - std::atomic<uint64_t>& BytesProcessed, - std::atomic<bool>& AbortFlag) const override - { - ZEN_TRACE_CPU("BasicChunkingController::ProcessFile"); - if (RawSize < m_Settings.ChunkFileSizeLimit) + virtual bool ProcessFile(const std::filesystem::path& InputPath, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + std::atomic<bool>& AbortFlag) const override { - return false; - } + ZEN_TRACE_CPU("BasicChunkingController::ProcessFile"); + if (RawSize < m_Settings.ChunkFileSizeLimit) + { + return false; + } - const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); - const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash); + const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); + const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash); - if (ExcludeFromChunking) - { - return false; + if (ExcludeFromChunking) + { + return false; + } + + BasicFile Buffer(InputPath, BasicFile::Mode::kRead); + if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + { + return false; + } + if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + { + return false; + } + + OutChunked = ChunkData(Buffer, 0, 
RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); + return true; } - BasicFile Buffer(InputPath, BasicFile::Mode::kRead); - if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + virtual std::string_view GetName() const override { return Name; } + + virtual CbObject GetParameters() const override { - return false; + CbObjectWriter Writer; + compactbinary_helpers::WriteArray(m_Settings.ExcludeExtensions, "ChunkExcludeExtensions"sv, Writer); + + Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); + Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + + WriteChunkParams(Writer, m_Settings.ChunkingParams); + + return Writer.Save(); } - if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + static constexpr std::string_view Name = "BasicChunkingController"sv; + + private: + static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters) { - return false; + return BasicChunkingControllerSettings{ + .ExcludeExtensions = compactbinary_helpers::ReadArray<std::string>("ChunkExcludeExtensions"sv, Parameters), + .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), + .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), + .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), + .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; } - OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); - return true; + const BasicChunkingControllerSettings m_Settings; + tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; + }; + + std::unique_ptr<ChunkingController> CreateBasicChunkingController(CbObjectView Parameters) + { + return std::make_unique<BasicChunkingController>(Parameters); } - virtual std::string_view GetName() const override { return Name; } + 
//////////// ChunkingControllerWithFixedChunking - virtual CbObject GetParameters() const override + class ChunkingControllerWithFixedChunking : public ChunkingController { - CbObjectWriter Writer; - Writer.BeginArray("ChunkExcludeExtensions"sv); + public: + ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings) { - for (const std::string& Extension : m_Settings.ExcludeExtensions) + m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); + for (const std::string& Extension : Settings.ExcludeExtensions) + { + m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + } + + m_FixedChunkingExtensionHashes.reserve(Settings.FixedChunkingExtensions.size()); + for (const std::string& Extension : Settings.FixedChunkingExtensions) { - Writer.AddString(Extension); + m_FixedChunkingExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); } } - Writer.EndArray(); // ChunkExcludeExtensions - Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); - Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); - Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} - WriteChunkParams(Writer, m_Settings.ChunkingParams); + virtual bool ProcessFile(const std::filesystem::path& InputPath, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + std::atomic<bool>& AbortFlag) const override + { + ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile"); + if (RawSize < m_Settings.ChunkFileSizeLimit) + { + return false; + } - return Writer.Save(); - } - static constexpr std::string_view Name = "BasicChunkingController"sv; + const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); + const bool ExcludeFromChunking = m_ExcludeExtensionHashes.contains(ExtensionHash); -private: - 
static BasicChunkingControllerSettings ReadSettings(CbObjectView Parameters) + if (ExcludeFromChunking) + { + return false; + } + + const bool FixedChunkingExtension = m_FixedChunkingExtensionHashes.contains(ExtensionHash); + + if (FixedChunkingExtension) + { + if (RawSize < m_Settings.MinSizeForFixedChunking) + { + return false; + } + ZEN_TRACE_CPU("FixedChunking"); + IoHashStream FullHasher; + BasicFile Source(InputPath, BasicFile::Mode::kRead); + uint64_t Offset = 0; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize); + ChunkHashToChunkIndex.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount); + OutChunked.ChunkSources.reserve(ExpectedChunkCount); + + static const uint64_t BufferingSize = 256u * 1024u; + + IoHashStream ChunkHasher; + + while (Offset < RawSize) + { + if (AbortFlag) + { + return false; + } + + ChunkHasher.Reset(); + + uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize); + if (ChunkSize >= (BufferingSize + BufferingSize / 2)) + { + ScanFile(Source.Handle(), + Offset, + ChunkSize, + BufferingSize, + [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) { + FullHasher.Append(Data, Size); + ChunkHasher.Append(Data, Size); + BytesProcessed.fetch_add(Size); + }); + } + else + { + IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize); + FullHasher.Append(ChunkData); + ChunkHasher.Append(ChunkData); + BytesProcessed.fetch_add(ChunkSize); + } + + const IoHash ChunkHash = ChunkHasher.GetHash(); + if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) + { + OutChunked.Info.ChunkSequence.push_back(It->second); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size()); + OutChunked.Info.ChunkHashes.push_back(ChunkHash); + 
OutChunked.Info.ChunkSequence.push_back(ChunkIndex); + OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)}); + } + Offset += ChunkSize; + } + OutChunked.Info.RawSize = RawSize; + OutChunked.Info.RawHash = FullHasher.GetHash(); + return true; + } + else + { + BasicFile Buffer(InputPath, BasicFile::Mode::kRead); + if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) + { + return false; + } + if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) + { + return false; + } + + OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); + return true; + } + } + + virtual std::string_view GetName() const override { return Name; } + + virtual CbObject GetParameters() const override + { + CbObjectWriter Writer; + compactbinary_helpers::WriteArray(m_Settings.FixedChunkingExtensions, "FixedChunkingExtensions", Writer); + compactbinary_helpers::WriteArray(m_Settings.ExcludeExtensions, "ChunkExcludeExtensions", Writer); + + Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); + Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + + Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + + WriteChunkParams(Writer, m_Settings.ChunkingParams); + + Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize); + Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking); + return Writer.Save(); + } + + static constexpr std::string_view Name = "ChunkingControllerWithFixedChunking"sv; + + private: + static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters) + { + return ChunkingControllerWithFixedChunkingSettings{ + .FixedChunkingExtensions = compactbinary_helpers::ReadArray<std::string>("FixedChunkingExtensions"sv, Parameters), + .ExcludeExtensions = compactbinary_helpers::ReadArray<std::string>("ChunkExcludeExtensions"sv, Parameters), + .ExcludeElfFiles = 
Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), + .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), + .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), + .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()), + .FixedChunkingChunkSize = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize), + .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultFixedChunkingChunkSize)}; + } + + const ChunkingControllerWithFixedChunkingSettings m_Settings; + tsl::robin_set<uint32_t> m_FixedChunkingExtensionHashes; + tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; + }; + + std::unique_ptr<ChunkingController> CreateChunkingControllerWithFixedChunking(CbObjectView Parameters) { - return BasicChunkingControllerSettings{ - .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()), - .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), - .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), - .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), - .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; + return std::make_unique<ChunkingControllerWithFixedChunking>(Parameters); } +} // namespace legacy - const BasicChunkingControllerSettings m_Settings; - tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; -}; +//////////// StandardChunkingController -class ChunkingControllerWithFixedChunking : public ChunkingController +class StandardChunkingController : public ChunkingController { public: - ChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Settings) : m_Settings(Settings) + StandardChunkingController(const StandardChunkingControllerSettings& Settings) : 
m_Settings(Settings) { - m_ExcludeExtensionHashes.reserve(Settings.ExcludeExtensions.size()); - for (const std::string& Extension : Settings.ExcludeExtensions) + m_SplitOnlyExtensionHashes.reserve(Settings.SplitOnlyExtensions.size()); + for (const std::string& Extension : Settings.SplitOnlyExtensions) { - m_ExcludeExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + m_SplitOnlyExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); } - m_FixedChunkingExtensionHashes.reserve(Settings.FixedChunkingExtensions.size()); - for (const std::string& Extension : Settings.FixedChunkingExtensions) + m_SplitAndCompressExtensionHashes.reserve(Settings.SplitAndCompressExtensions.size()); + for (const std::string& Extension : Settings.SplitAndCompressExtensions) { - m_FixedChunkingExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); + m_SplitAndCompressExtensionHashes.insert(HashStringAsLowerDjb2(Extension)); } } - ChunkingControllerWithFixedChunking(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + StandardChunkingController(CbObjectView Parameters) : m_Settings(ReadSettings(Parameters)) {} + + bool FixedChunking(BasicFile& Source, + uint64_t RawSize, + ChunkedInfoWithSource& OutChunked, + std::atomic<uint64_t>& BytesProcessed, + const uint64_t FixedChunkSize, + std::atomic<bool>& AbortFlag) const + { + ZEN_TRACE_CPU("FixedChunking"); + + IoHashStream FullHasher; + uint64_t Offset = 0; + tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; + const uint64_t ExpectedChunkCount = 1 + (RawSize / FixedChunkSize); + ChunkHashToChunkIndex.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount); + OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount); + OutChunked.ChunkSources.reserve(ExpectedChunkCount); + + static const uint64_t BufferingSize = 256u * 1024u; + static const uint64_t MinimumLastChunkSize = Min(128u * 1024u, FixedChunkSize / 32); + + IoHashStream ChunkHasher; + + BasicFileBuffer 
SourceBuffer(Source, Min(BufferingSize, RawSize)); + while (Offset < RawSize) + { + if (AbortFlag) + { + return false; + } + + ChunkHasher.Reset(); + + const uint64_t ChunkStartOffset = Offset; + const uint64_t BytesLeft = RawSize - Offset; + uint64_t ChunkSize = std::min<uint64_t>(BytesLeft, FixedChunkSize); + if ((BytesLeft - ChunkSize) < MinimumLastChunkSize) + { + // Avoid small chunks from the end of the file + ChunkSize = BytesLeft; + } + const uint64_t End = ChunkStartOffset + ChunkSize; + while (Offset < End) + { + const uint64_t BufferSize = std::min<uint64_t>(RawSize - Offset, BufferingSize); + MemoryView ChunkData = SourceBuffer.MakeView(BufferSize, Offset); + FullHasher.Append(ChunkData); + ChunkHasher.Append(ChunkData); + BytesProcessed.fetch_add(BufferSize); + Offset += BufferSize; + } + + const IoHash ChunkHash = ChunkHasher.GetHash(); + if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) + { + OutChunked.Info.ChunkSequence.push_back(It->second); + } + else + { + uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size()); + OutChunked.Info.ChunkHashes.push_back(ChunkHash); + OutChunked.Info.ChunkSequence.push_back(ChunkIndex); + OutChunked.ChunkSources.push_back({.Offset = ChunkStartOffset, .Size = gsl::narrow<uint32_t>(ChunkSize)}); + } + } + + OutChunked.Info.RawSize = RawSize; + OutChunked.Info.RawHash = FullHasher.GetHash(); + return true; + } virtual bool ProcessFile(const std::filesystem::path& InputPath, uint64_t RawSize, @@ -192,106 +468,60 @@ public: std::atomic<uint64_t>& BytesProcessed, std::atomic<bool>& AbortFlag) const override { - ZEN_TRACE_CPU("ChunkingControllerWithFixedChunking::ProcessFile"); - if (RawSize < m_Settings.ChunkFileSizeLimit) - { - return false; - } + ZEN_TRACE_CPU("StandardChunkingController::ProcessFile"); - const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); - const bool ExcludeFromChunking = 
m_ExcludeExtensionHashes.contains(ExtensionHash); + const uint32_t ExtensionHash = HashStringAsLowerDjb2(InputPath.extension().string()); - if (ExcludeFromChunking) + if (m_SplitOnlyExtensionHashes.contains(ExtensionHash)) { - return false; + if (RawSize < m_Settings.SplitOnlyMinSize) + { + return false; + } + BasicFile Source(InputPath, BasicFile::Mode::kRead); + return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitOnlyChunkSize, AbortFlag); } - const bool FixedChunkingExtension = m_FixedChunkingExtensionHashes.contains(ExtensionHash); - - if (FixedChunkingExtension) + if (m_SplitAndCompressExtensionHashes.contains(ExtensionHash)) { - if (RawSize < m_Settings.MinSizeForFixedChunking) + if (RawSize < m_Settings.SplitAndCompressMinSize) { return false; } - ZEN_TRACE_CPU("FixedChunking"); - IoHashStream FullHasher; - BasicFile Source(InputPath, BasicFile::Mode::kRead); - uint64_t Offset = 0; - tsl::robin_map<IoHash, uint32_t, IoHash::Hasher> ChunkHashToChunkIndex; - const uint64_t ExpectedChunkCount = 1 + (RawSize / m_Settings.FixedChunkingChunkSize); - ChunkHashToChunkIndex.reserve(ExpectedChunkCount); - OutChunked.Info.ChunkHashes.reserve(ExpectedChunkCount); - OutChunked.Info.ChunkSequence.reserve(ExpectedChunkCount); - OutChunked.ChunkSources.reserve(ExpectedChunkCount); - - static const uint64_t BufferingSize = 256u * 1024u; - - IoHashStream ChunkHasher; - - while (Offset < RawSize) + BasicFile Source(InputPath, BasicFile::Mode::kRead); + return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitAndCompressChunkSize, AbortFlag); + } + + if (RawSize > sizeof(uint32_t) && !m_Settings.SplitAndCompressFileLeadingBytes.empty()) + { + BasicFile Source(InputPath, BasicFile::Mode::kRead); + uint32_t LeadingBytes = 0; + Source.Read(&LeadingBytes, 4, 0); + if (std::find(m_Settings.SplitAndCompressFileLeadingBytes.begin(), + m_Settings.SplitAndCompressFileLeadingBytes.end(), + LeadingBytes) != 
m_Settings.SplitAndCompressFileLeadingBytes.end()) { - if (AbortFlag) + if (RawSize < m_Settings.SplitAndCompressMinSize) { return false; } - - ChunkHasher.Reset(); - - uint64_t ChunkSize = std::min<uint64_t>(RawSize - Offset, m_Settings.FixedChunkingChunkSize); - if (ChunkSize >= (BufferingSize + BufferingSize / 2)) - { - ScanFile(Source.Handle(), - Offset, - ChunkSize, - BufferingSize, - [&FullHasher, &ChunkHasher, &BytesProcessed](const void* Data, size_t Size) { - FullHasher.Append(Data, Size); - ChunkHasher.Append(Data, Size); - BytesProcessed.fetch_add(Size); - }); - } - else - { - IoBuffer ChunkData = Source.ReadRange(Offset, ChunkSize); - FullHasher.Append(ChunkData); - ChunkHasher.Append(ChunkData); - BytesProcessed.fetch_add(ChunkSize); - } - - const IoHash ChunkHash = ChunkHasher.GetHash(); - if (auto It = ChunkHashToChunkIndex.find(ChunkHash); It != ChunkHashToChunkIndex.end()) - { - OutChunked.Info.ChunkSequence.push_back(It->second); - } else { - uint32_t ChunkIndex = gsl::narrow<uint32_t>(OutChunked.Info.ChunkHashes.size()); - OutChunked.Info.ChunkHashes.push_back(ChunkHash); - OutChunked.Info.ChunkSequence.push_back(ChunkIndex); - OutChunked.ChunkSources.push_back({.Offset = Offset, .Size = gsl::narrow<uint32_t>(ChunkSize)}); + return FixedChunking(Source, RawSize, OutChunked, BytesProcessed, m_Settings.SplitAndCompressChunkSize, AbortFlag); } - Offset += ChunkSize; } - OutChunked.Info.RawSize = RawSize; - OutChunked.Info.RawHash = FullHasher.GetHash(); - return true; + } + + if (RawSize < m_Settings.DynamicChunkingParams.MaxSize) + { + return false; } else { - BasicFile Buffer(InputPath, BasicFile::Mode::kRead); - if (m_Settings.ExcludeElfFiles && IsElfFile(Buffer)) - { - return false; - } - if (m_Settings.ExcludeMachOFiles && IsMachOFile(Buffer)) - { - return false; - } - - OutChunked = ChunkData(Buffer, 0, RawSize, m_Settings.ChunkingParams, &BytesProcessed, &AbortFlag); - return true; + BasicFile Source(InputPath, BasicFile::Mode::kRead); + 
OutChunked = ChunkData(Source, 0, RawSize, m_Settings.DynamicChunkingParams, &BytesProcessed, &AbortFlag); } + return true; } virtual std::string_view GetName() const override { return Name; } @@ -299,89 +529,71 @@ public: virtual CbObject GetParameters() const override { CbObjectWriter Writer; - Writer.BeginArray("FixedChunkingExtensions"); - { - for (const std::string& Extension : m_Settings.FixedChunkingExtensions) - { - Writer.AddString(Extension); - } - } - Writer.EndArray(); // ChunkExcludeExtensions - - Writer.BeginArray("ChunkExcludeExtensions"sv); - { - for (const std::string& Extension : m_Settings.ExcludeExtensions) - { - Writer.AddString(Extension); - } - } - Writer.EndArray(); // ChunkExcludeExtensions - Writer.AddBool("ExcludeElfFiles"sv, m_Settings.ExcludeElfFiles); - Writer.AddBool("ExcludeMachOFiles"sv, m_Settings.ExcludeMachOFiles); + compactbinary_helpers::WriteArray(m_Settings.SplitOnlyExtensions, "SplitOnlyExtensions"sv, Writer); + Writer.AddInteger("SplitOnlyChunkSize"sv, m_Settings.SplitOnlyChunkSize); + Writer.AddInteger("SplitOnlyMinSize"sv, m_Settings.SplitOnlyMinSize); - Writer.AddInteger("ChunkFileSizeLimit"sv, m_Settings.ChunkFileSizeLimit); + compactbinary_helpers::WriteArray(m_Settings.SplitAndCompressExtensions, "SplitAndCompressExtensions"sv, Writer); + compactbinary_helpers::WriteArray(m_Settings.SplitAndCompressFileLeadingBytes, "SplitAndCompressFileLeadingBytes"sv, Writer); + Writer.AddInteger("SplitAndCompressChunkSize"sv, m_Settings.SplitAndCompressChunkSize); + Writer.AddInteger("SplitAndCompressMinSize"sv, m_Settings.SplitAndCompressMinSize); - WriteChunkParams(Writer, m_Settings.ChunkingParams); + WriteChunkParams(Writer, m_Settings.DynamicChunkingParams); - Writer.AddInteger("FixedChunkingChunkSize"sv, m_Settings.FixedChunkingChunkSize); - Writer.AddInteger("MinSizeForFixedChunking"sv, m_Settings.MinSizeForFixedChunking); return Writer.Save(); } - static constexpr std::string_view Name = 
"ChunkingControllerWithFixedChunking"sv; + static constexpr std::string_view Name = "StandardChunkingController"sv; private: - static ChunkingControllerWithFixedChunkingSettings ReadSettings(CbObjectView Parameters) + static StandardChunkingControllerSettings ReadSettings(CbObjectView Parameters) { - return ChunkingControllerWithFixedChunkingSettings{ - .FixedChunkingExtensions = ReadStringArray(Parameters["FixedChunkingExtensions"sv].AsArrayView()), - .ExcludeExtensions = ReadStringArray(Parameters["ChunkExcludeExtensions"sv].AsArrayView()), - .ExcludeElfFiles = Parameters["ExcludeElfFiles"sv].AsBool(DefaultChunkingExcludeElfFiles), - .ExcludeMachOFiles = Parameters["ExcludeMachOFiles"sv].AsBool(DefaultChunkingExcludeMachOFiles), - .ChunkFileSizeLimit = Parameters["ChunkFileSizeLimit"sv].AsUInt64(DefaultChunkingFileSizeLimit), - .ChunkingParams = ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView()), - .FixedChunkingChunkSize = Parameters["FixedChunkingChunkSize"sv].AsUInt64(DefaultFixedChunkingChunkSize), - .MinSizeForFixedChunking = Parameters["MinSizeForFixedChunking"sv].AsUInt64(DefaultFixedChunkingChunkSize)}; + return StandardChunkingControllerSettings{ + .SplitOnlyExtensions = compactbinary_helpers::ReadArray<std::string>("SplitOnlyExtensions"sv, Parameters), + .SplitOnlyChunkSize = Parameters["SplitOnlyChunkSize"sv].AsUInt64(DefaultSplitOnlyChunkSize), + .SplitOnlyMinSize = Parameters["SplitOnlyMinSize"sv].AsUInt64(DefaultSplitOnlyMinSize), + + .SplitAndCompressExtensions = compactbinary_helpers::ReadArray<std::string>("SplitAndCompressExtensions"sv, Parameters), + .SplitAndCompressFileLeadingBytes = + compactbinary_helpers::ReadArray<uint32_t>("SplitAndCompressFileLeadingBytes"sv, Parameters), + .SplitAndCompressChunkSize = Parameters["SplitAndCompressChunkSize"sv].AsUInt64(DefaultSplitAndCompressChunkSize), + .SplitAndCompressMinSize = Parameters["SplitAndCompressMinSize"sv].AsUInt64(DefaultSplitAndCompressMinSize), + + .DynamicChunkingParams = 
ReadChunkParams(Parameters["ChunkingParams"sv].AsObjectView())}; } - const ChunkingControllerWithFixedChunkingSettings m_Settings; - tsl::robin_set<uint32_t> m_FixedChunkingExtensionHashes; - tsl::robin_set<uint32_t> m_ExcludeExtensionHashes; + const StandardChunkingControllerSettings m_Settings; + tsl::robin_set<uint32_t> m_SplitOnlyExtensionHashes; + tsl::robin_set<uint32_t> m_SplitAndCompressExtensionHashes; }; std::unique_ptr<ChunkingController> -CreateBasicChunkingController(const BasicChunkingControllerSettings& Settings) -{ - return std::make_unique<BasicChunkingController>(Settings); -} -std::unique_ptr<ChunkingController> -CreateBasicChunkingController(CbObjectView Parameters) +CreateStandardChunkingController(const StandardChunkingControllerSettings& Setting) { - return std::make_unique<BasicChunkingController>(Parameters); + return std::make_unique<StandardChunkingController>(Setting); } std::unique_ptr<ChunkingController> -CreateChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Setting) -{ - return std::make_unique<ChunkingControllerWithFixedChunking>(Setting); -} -std::unique_ptr<ChunkingController> -CreateChunkingControllerWithFixedChunking(CbObjectView Parameters) +CreateStandardChunkingController(CbObjectView Parameters) { - return std::make_unique<ChunkingControllerWithFixedChunking>(Parameters); + return std::make_unique<StandardChunkingController>(Parameters); } std::unique_ptr<ChunkingController> CreateChunkingController(std::string_view Name, CbObjectView Parameters) { - if (Name == BasicChunkingController::Name) + if (Name == legacy::BasicChunkingController::Name) + { + return legacy::CreateBasicChunkingController(Parameters); + } + else if (Name == legacy::ChunkingControllerWithFixedChunking::Name) { - return CreateBasicChunkingController(Parameters); + return legacy::CreateChunkingControllerWithFixedChunking(Parameters); } - else if (Name == ChunkingControllerWithFixedChunking::Name) + else if (Name == 
StandardChunkingController::Name) { - return CreateChunkingControllerWithFixedChunking(Parameters); + return CreateStandardChunkingController(Parameters); } return {}; } diff --git a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h index f200a342c..7e3903892 100644 --- a/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h +++ b/src/zenremotestore/include/zenremotestore/builds/buildstorageoperations.h @@ -565,6 +565,8 @@ public: std::vector<std::string> ExcludeFolders; std::vector<std::string> ExcludeExtensions; std::string ZenExcludeManifestName = ".zen_exclude_manifest.txt"; + + std::vector<std::string> NonCompressableExtensions; }; BuildsOperationUploadFolder(BuildOpLogOutput& LogOutput, StorageInstance& Storage, @@ -693,6 +695,8 @@ private: const bool m_CreateBuild; // ?? Member? const CbObject m_MetaData; // ?? Member const Options m_Options; + + tsl::robin_set<uint32_t> m_NonCompressableExtensionHashes; }; struct ValidateStatistics diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkedcontent.h b/src/zenremotestore/include/zenremotestore/chunking/chunkedcontent.h index 2a56d14d3..a4f22f4df 100644 --- a/src/zenremotestore/include/zenremotestore/chunking/chunkedcontent.h +++ b/src/zenremotestore/include/zenremotestore/chunking/chunkedcontent.h @@ -184,107 +184,4 @@ GetFirstPathIndexForRawHash(const ChunkedContentLookup& Lookup, const IoHash& Ra return GetFirstPathIndexForSeqeuenceIndex(Lookup, SequenceIndex); } -namespace compactbinary_helpers { - template<typename Type> - void WriteArray(std::span<const Type> Values, std::string_view ArrayName, CbWriter& Output) - { - Output.BeginArray(ArrayName); - for (const Type Value : Values) - { - Output << Value; - } - Output.EndArray(); - } - - template<typename Type> - void WriteArray(const std::vector<Type>& Values, std::string_view ArrayName, CbWriter& Output) - { - 
WriteArray(std::span<const Type>(Values), ArrayName, Output); - } - - template<> - inline void WriteArray(std::span<const std::filesystem::path> Values, std::string_view ArrayName, CbWriter& Output) - { - Output.BeginArray(ArrayName); - for (const std::filesystem::path& Path : Values) - { - Output.AddString((const char*)Path.generic_u8string().c_str()); - } - Output.EndArray(); - } - - template<> - inline void WriteArray(const std::vector<std::filesystem::path>& Values, std::string_view ArrayName, CbWriter& Output) - { - WriteArray(std::span<const std::filesystem::path>(Values), ArrayName, Output); - } - - inline void WriteBinaryAttachmentArray(std::span<const IoHash> Values, std::string_view ArrayName, CbWriter& Output) - { - Output.BeginArray(ArrayName); - for (const IoHash& Hash : Values) - { - Output.AddBinaryAttachment(Hash); - } - Output.EndArray(); - } - - inline void WriteBinaryAttachmentArray(const std::vector<IoHash>& Values, std::string_view ArrayName, CbWriter& Output) - { - WriteArray(std::span<const IoHash>(Values), ArrayName, Output); - } - - inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<uint32_t>& Result) - { - CbArrayView Array = Input[ArrayName].AsArrayView(); - Result.reserve(Array.Num()); - for (CbFieldView ItemView : Array) - { - Result.push_back(ItemView.AsUInt32()); - } - } - - inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<uint64_t>& Result) - { - CbArrayView Array = Input[ArrayName].AsArrayView(); - Result.reserve(Array.Num()); - for (CbFieldView ItemView : Array) - { - Result.push_back(ItemView.AsUInt64()); - } - } - - inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<std::filesystem::path>& Result) - { - CbArrayView Array = Input[ArrayName].AsArrayView(); - Result.reserve(Array.Num()); - for (CbFieldView ItemView : Array) - { - std::u8string_view U8Path = ItemView.AsU8String(); - Result.push_back(std::filesystem::path(U8Path)); - } - } - 
- inline void ReadArray(std::string_view ArrayName, CbObjectView Input, std::vector<IoHash>& Result) - { - CbArrayView Array = Input[ArrayName].AsArrayView(); - Result.reserve(Array.Num()); - for (CbFieldView ItemView : Array) - { - Result.push_back(ItemView.AsHash()); - } - } - - inline void ReadBinaryAttachmentArray(std::string_view ArrayName, CbObjectView Input, std::vector<IoHash>& Result) - { - CbArrayView Array = Input[ArrayName].AsArrayView(); - Result.reserve(Array.Num()); - for (CbFieldView ItemView : Array) - { - Result.push_back(ItemView.AsBinaryAttachment()); - } - } - -} // namespace compactbinary_helpers - } // namespace zen diff --git a/src/zenremotestore/include/zenremotestore/chunking/chunkingcontroller.h b/src/zenremotestore/include/zenremotestore/chunking/chunkingcontroller.h index b49d76a09..809e7c475 100644 --- a/src/zenremotestore/include/zenremotestore/chunking/chunkingcontroller.h +++ b/src/zenremotestore/include/zenremotestore/chunking/chunkingcontroller.h @@ -11,23 +11,6 @@ namespace zen { -const std::vector<std::string> DefaultChunkingExcludeExtensions = { - ".exe", ".dll", ".pdb", ".self", ".mp4", ".zip", ".7z", ".bzip", ".rar", ".gzip", ".sym", ".psym", ".txt", ".ini", ".json", - ".verse", ".versemodule", ".jpg", ".c", ".h", ".cpp", ".cxx", ".c++", ".cc", ".hpp", ".hxx", ".h++", ".py", ".ogg", ".plist"}; - -const std::vector<std::string> DefaultFixedChunkingExtensions = {".apk", ".nsp", ".xvc", ".pkg", ".dmg", ".ipa"}; -const bool DefaultChunkingExcludeElfFiles = true; -const bool DefaultChunkingExcludeMachOFiles = true; - -const ChunkedParams DefaultChunkedParams = {.MinSize = ((8u * 1u) * 1024u) - 128u, - .MaxSize = 128u * 1024u, - .AvgSize = ((8u * 4u) * 1024u) + 128u}; - -const size_t DefaultChunkingFileSizeLimit = DefaultChunkedParams.MaxSize; - -const uint64_t DefaultFixedChunkingChunkSize = 32u * 1024u * 1024u; -const uint64_t DefaultMinSizeForFixedChunking = DefaultFixedChunkingChunkSize * 8u; - struct ChunkedInfoWithSource; 
class ChunkingController @@ -45,33 +28,46 @@ public: virtual CbObject GetParameters() const = 0; }; -struct BasicChunkingControllerSettings -{ - std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; - bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; - bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; - uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; - ChunkedParams ChunkingParams = DefaultChunkedParams; -}; +//////////// Standard chunking controller -std::unique_ptr<ChunkingController> CreateBasicChunkingController(const BasicChunkingControllerSettings& Settings); -std::unique_ptr<ChunkingController> CreateBasicChunkingController(CbObjectView Parameters); +const std::vector<std::string> DefaultSplitOnlyExtensions( + {".mp4", ".zip", ".7z", ".bzip", ".rar", ".gzip", ".apk", ".nsp", ".xvc", ".pkg", ".dmg", ".ipa", ".jpg", ".ogg", ".msixvc"}); +const uint64_t DefaultSplitOnlyChunkSize = 32u * 1024u * 1024u; +const uint64_t DefaultSplitOnlyMinSize = DefaultSplitOnlyChunkSize * 4u; -struct ChunkingControllerWithFixedChunkingSettings +const std::vector<std::string> DefaultSplitAndCompressExtensions({".exe", ".dll", ".pdb", ".self", ".sym", ".psym", + ".dSYM", ".txt", ".ini", ".json", ".verse", ".versemodule", + ".c", ".h", ".cpp", ".cxx", ".c++", ".cc", + ".hpp", ".hxx", ".h++", ".py", ".plist"}); + +const std::vector<uint32_t> DefaultSplitAndCompressFileLeadingBytes({ + 0x464c457f, // Elf + 0xfeedface, // MachO Big Endian + 0xcefaedfe, // MachO Little Endian +}); + +const uint64_t DefaultSplitAndCompressChunkSize = 64u * 1024u * 1024u; +const uint64_t DefaultSplitAndCompressMinSize = DefaultSplitAndCompressChunkSize * 4u; + +const ChunkedParams DefaultDynamicChunkingParams = {.MinSize = ((8u * 1u) * 1024u) - 128u, + .MaxSize = 128u * 1024u, + .AvgSize = ((8u * 4u) * 1024u) + 128u}; + +struct StandardChunkingControllerSettings { - std::vector<std::string> FixedChunkingExtensions = DefaultFixedChunkingExtensions; - 
std::vector<std::string> ExcludeExtensions = DefaultChunkingExcludeExtensions; - bool ExcludeElfFiles = DefaultChunkingExcludeElfFiles; - bool ExcludeMachOFiles = DefaultChunkingExcludeMachOFiles; - uint64_t ChunkFileSizeLimit = DefaultChunkingFileSizeLimit; - ChunkedParams ChunkingParams = DefaultChunkedParams; - uint64_t FixedChunkingChunkSize = DefaultFixedChunkingChunkSize; - uint64_t MinSizeForFixedChunking = DefaultMinSizeForFixedChunking; -}; + std::vector<std::string> SplitOnlyExtensions = DefaultSplitOnlyExtensions; + uint64_t SplitOnlyChunkSize = DefaultSplitOnlyChunkSize; + uint64_t SplitOnlyMinSize = DefaultSplitOnlyMinSize; + + std::vector<std::string> SplitAndCompressExtensions = DefaultSplitAndCompressExtensions; + std::vector<uint32_t> SplitAndCompressFileLeadingBytes = DefaultSplitAndCompressFileLeadingBytes; + uint64_t SplitAndCompressChunkSize = DefaultSplitAndCompressChunkSize; + uint64_t SplitAndCompressMinSize = DefaultSplitAndCompressMinSize; -std::unique_ptr<ChunkingController> CreateChunkingControllerWithFixedChunking(const ChunkingControllerWithFixedChunkingSettings& Setting); -std::unique_ptr<ChunkingController> CreateChunkingControllerWithFixedChunking(CbObjectView Parameters); + ChunkedParams DynamicChunkingParams = DefaultDynamicChunkingParams; +}; +std::unique_ptr<ChunkingController> CreateStandardChunkingController(const StandardChunkingControllerSettings& Setting); std::unique_ptr<ChunkingController> CreateChunkingController(std::string_view Name, CbObjectView Parameters); } // namespace zen |