From 8f192ab154ff9de41d4c063138478400fe2aef24 Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Fri, 28 Mar 2025 14:12:42 +0100 Subject: temp path options and reduced scanning of target folder (#328) - Feature: zen: `--zen-folder-path` added to `builds` command, `list`, `upload`, `download`, `fetch-blob`, `validate-part` to control where `.zen` folder is placed and named - Improvement: Only check known files from remote state when downloading to a target folder with no local state file - Improvement: Don't move existing local to cache and back if they are untouched --- src/zenutil/chunkedcontent.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'src/zenutil/chunkedcontent.cpp') diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp index bb1ee5183..1e8447a57 100644 --- a/src/zenutil/chunkedcontent.cpp +++ b/src/zenutil/chunkedcontent.cpp @@ -304,14 +304,22 @@ FolderContent GetUpdatedContent(const FolderContent& Old, const FolderContent& New, std::vector& OutDeletedPathIndexes) { ZEN_TRACE_CPU("FolderContent::GetUpdatedContent"); - FolderContent Result = {.Platform = Old.Platform}; + + const uint32_t NewPathCount = gsl::narrow(New.Paths.size()); + + FolderContent Result = {.Platform = Old.Platform}; + Result.Paths.reserve(NewPathCount); + Result.RawSizes.reserve(NewPathCount); + Result.Attributes.reserve(NewPathCount); + Result.ModificationTicks.reserve(NewPathCount); + tsl::robin_map NewPathToIndex; - const uint32_t NewPathCount = gsl::narrow(New.Paths.size()); NewPathToIndex.reserve(NewPathCount); for (uint32_t NewPathIndex = 0; NewPathIndex < NewPathCount; NewPathIndex++) { NewPathToIndex.insert({New.Paths[NewPathIndex].generic_string(), NewPathIndex}); } + uint32_t OldPathCount = gsl::narrow(Old.Paths.size()); for (uint32_t OldPathIndex = 0; OldPathIndex < OldPathCount; OldPathIndex++) { @@ -667,6 +675,12 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span const 
ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent); tsl::robin_map ChunkHashToChunkIndex; + const size_t ExpectedCount = BaseContent.Paths.size() - DeletedPaths.size(); + Result.Paths.reserve(ExpectedCount); + Result.RawSizes.reserve(ExpectedCount); + Result.Attributes.reserve(ExpectedCount); + Result.RawHashes.reserve(ExpectedCount); + tsl::robin_map RawHashToSequenceRawHashIndex; for (uint32_t PathIndex = 0; PathIndex < BaseContent.Paths.size(); PathIndex++) { -- cgit v1.2.3 From bbb7e0c77b9ed114baf428256f347631b3e1092c Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Tue, 1 Apr 2025 14:26:53 +0200 Subject: verify that we can read input files that are only hashed (#333) * output build and part details by default * output executable and version at start of builds command * verify that we can read files we do not chunk --- src/zenutil/chunkedcontent.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'src/zenutil/chunkedcontent.cpp') diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp index 1e8447a57..32ae2d94a 100644 --- a/src/zenutil/chunkedcontent.cpp +++ b/src/zenutil/chunkedcontent.cpp @@ -140,8 +140,12 @@ namespace { { ZEN_TRACE_CPU("HashOnly"); - IoBuffer Buffer = IoBufferBuilder::MakeFromFile((FolderPath / Path).make_preferred()); - const IoHash Hash = IoHash::HashBuffer(Buffer, &Stats.BytesHashed); + IoBuffer Buffer = IoBufferBuilder::MakeFromFile((FolderPath / Path).make_preferred()); + if (Buffer.GetSize() != RawSize) + { + throw std::runtime_error(fmt::format("Failed opening file '{}' for hashing", FolderPath / Path)); + } + const IoHash Hash = IoHash::HashBuffer(Buffer, &Stats.BytesHashed); Lock.WithExclusiveLock([&]() { if (!RawHashToSequenceRawHashIndex.contains(Hash)) -- cgit v1.2.3 From 1ca32ca4718dad5bf1e2f381fe93b47d8159807b Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Tue, 8 Apr 2025 18:57:25 +0200 Subject: scavenge builds (#352) - Improvement: `zen builds` now 
scavenges previous download locations for data to reduce download size, enabled by default, disable with `--enable-scavenge=false` - Bugfix: Failing to rename a file during download sometimes reported an error when it succeeded when retrying --- src/zenutil/chunkedcontent.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/zenutil/chunkedcontent.cpp') diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp index 32ae2d94a..17b348f8d 100644 --- a/src/zenutil/chunkedcontent.cpp +++ b/src/zenutil/chunkedcontent.cpp @@ -305,7 +305,7 @@ FolderContent::UpdateState(const FolderContent& Rhs, std::vector& OutP } FolderContent -GetUpdatedContent(const FolderContent& Old, const FolderContent& New, std::vector& OutDeletedPathIndexes) +GetUpdatedContent(const FolderContent& Old, const FolderContent& New, std::vector& OutDeletedPaths) { ZEN_TRACE_CPU("FolderContent::GetUpdatedContent"); @@ -342,7 +342,7 @@ GetUpdatedContent(const FolderContent& Old, const FolderContent& New, std::vecto } else { - OutDeletedPathIndexes.push_back(Old.Paths[OldPathIndex]); + OutDeletedPaths.push_back(Old.Paths[OldPathIndex]); } } return Result; -- cgit v1.2.3 From 4e2efa1051e3eb86ab48d92b3f6ad5896cda5d81 Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Fri, 16 May 2025 19:51:36 +0200 Subject: parallel work handle dispatch exception (#400) - Bugfix: Wait for async threads if dispatching of work using ParallellWork throws exception --- src/zenutil/chunkedcontent.cpp | 52 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'src/zenutil/chunkedcontent.cpp') diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp index 17b348f8d..ae129324e 100644 --- a/src/zenutil/chunkedcontent.cpp +++ b/src/zenutil/chunkedcontent.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include #include ZEN_THIRD_PARTY_INCLUDES_START @@ -378,7 +378,7 @@ GetFolderContent(GetFolderContentStatistics& 
Stats, std::function&& AcceptDirectory, std::function&& AcceptFile, WorkerThreadPool& WorkerPool, - int32_t UpdateInteralMS, + int32_t UpdateIntervalMS, std::function&& UpdateCallback, std::atomic& AbortFlag) { @@ -467,7 +467,7 @@ GetFolderContent(GetFolderContentStatistics& Stats, WorkerPool, PendingWork); PendingWork.CountDown(); - while (!PendingWork.Wait(UpdateInteralMS)) + while (!PendingWork.Wait(UpdateIntervalMS)) { UpdateCallback(AbortFlag.load(), PendingWork.Remaining()); } @@ -731,7 +731,7 @@ ChunkFolderContent(ChunkingStatistics& Stats, const std::filesystem::path& RootPath, const FolderContent& Content, const ChunkingController& InChunkingController, - int32_t UpdateInteralMS, + int32_t UpdateIntervalMS, std::function&& UpdateCallback, std::atomic& AbortFlag) { @@ -772,7 +772,7 @@ ChunkFolderContent(ChunkingStatistics& Stats, RwLock Lock; - ParallellWork Work(AbortFlag); + ParallelWork Work(AbortFlag); for (uint32_t PathIndex : Order) { @@ -780,28 +780,26 @@ ChunkFolderContent(ChunkingStatistics& Stats, { break; } - Work.ScheduleWork( - WorkerPool, // GetSyncWorkerPool() - [&, PathIndex](std::atomic& AbortFlag) { - if (!AbortFlag) - { - IoHash RawHash = HashOneFile(Stats, - InChunkingController, - Result, - ChunkHashToChunkIndex, - RawHashToSequenceRawHashIndex, - Lock, - RootPath, - PathIndex, - AbortFlag); - Lock.WithExclusiveLock([&]() { Result.RawHashes[PathIndex] = RawHash; }); - Stats.FilesProcessed++; - } - }, - Work.DefaultErrorFunction()); - } - - Work.Wait(UpdateInteralMS, [&](bool IsAborted, std::ptrdiff_t PendingWork) { + Work.ScheduleWork(WorkerPool, // GetSyncWorkerPool() + [&, PathIndex](std::atomic& AbortFlag) { + if (!AbortFlag) + { + IoHash RawHash = HashOneFile(Stats, + InChunkingController, + Result, + ChunkHashToChunkIndex, + RawHashToSequenceRawHashIndex, + Lock, + RootPath, + PathIndex, + AbortFlag); + Lock.WithExclusiveLock([&]() { Result.RawHashes[PathIndex] = RawHash; }); + Stats.FilesProcessed++; + } + }); + } + + 
Work.Wait(UpdateIntervalMS, [&](bool IsAborted, std::ptrdiff_t PendingWork) { ZEN_UNUSED(IsAborted); ZEN_UNUSED(PendingWork); UpdateCallback(Work.IsAborted(), Work.PendingWork().Remaining()); -- cgit v1.2.3 From a0b10b046095d57ffbdb46c83084601a832f4562 Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Tue, 3 Jun 2025 16:21:01 +0200 Subject: fixed size chunking for encrypted files (#410) - Improvement: Use fixed size block chunking for known encrypted/compressed file types - Improvement: Skip trying to compress chunks that are sourced from files that are known to be encrypted/compressed - Improvement: Add global open file cache for written files increasing throughput during download by reducing overhead of open/close of file by 80% --- src/zenutil/chunkedcontent.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/zenutil/chunkedcontent.cpp') diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp index ae129324e..4bec4901a 100644 --- a/src/zenutil/chunkedcontent.cpp +++ b/src/zenutil/chunkedcontent.cpp @@ -891,8 +891,12 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) } Result.SequenceIndexFirstPathIndex.resize(Content.ChunkedContent.SequenceRawHashes.size(), (uint32_t)-1); + Result.PathExtensionHash.resize(Content.Paths.size()); for (uint32_t PathIndex = 0; PathIndex < Content.Paths.size(); PathIndex++) { + std::string LowercaseExtension = Content.Paths[PathIndex].extension().string(); + std::transform(LowercaseExtension.begin(), LowercaseExtension.end(), LowercaseExtension.begin(), ::tolower); + Result.PathExtensionHash[PathIndex] = HashStringDjb2(LowercaseExtension); if (Content.RawSizes[PathIndex] > 0) { const IoHash& RawHash = Content.RawHashes[PathIndex]; -- cgit v1.2.3 From 937510356143f83ecd15d0a9f58b611c7418ed61 Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Wed, 4 Jun 2025 08:59:44 +0200 Subject: faster scavenge (#417) - Improvement: Multithreaded scavenge pass for zen builds download - Improvement: 
Optimized check for modified files when verifying state of scavenged paths --- src/zenutil/chunkedcontent.cpp | 91 +++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 28 deletions(-) (limited to 'src/zenutil/chunkedcontent.cpp') diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp index 4bec4901a..c7532e098 100644 --- a/src/zenutil/chunkedcontent.cpp +++ b/src/zenutil/chunkedcontent.cpp @@ -662,7 +662,9 @@ MergeChunkedFolderContents(const ChunkedFolderContent& Base, std::span DeletedPaths) +DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, + const ChunkedContentLookup& BaseContentLookup, + std::span DeletedPaths) { ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); @@ -676,14 +678,18 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span { DeletedPathSet.insert(PathCompareString(DeletedPath)); } - const ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent); - tsl::robin_map ChunkHashToChunkIndex; - const size_t ExpectedCount = BaseContent.Paths.size() - DeletedPaths.size(); - Result.Paths.reserve(ExpectedCount); - Result.RawSizes.reserve(ExpectedCount); - Result.Attributes.reserve(ExpectedCount); - Result.RawHashes.reserve(ExpectedCount); + const size_t BaseChunkCount = BaseContent.ChunkedContent.ChunkHashes.size(); + std::vector NewChunkIndexes(BaseChunkCount, (uint32_t)-1); + + const size_t ExpectedPathCount = BaseContent.Paths.size() - DeletedPaths.size(); + Result.Paths.reserve(ExpectedPathCount); + Result.RawSizes.reserve(ExpectedPathCount); + Result.Attributes.reserve(ExpectedPathCount); + Result.RawHashes.reserve(ExpectedPathCount); + + Result.ChunkedContent.ChunkHashes.reserve(BaseChunkCount); + Result.ChunkedContent.ChunkRawSizes.reserve(BaseChunkCount); tsl::robin_map RawHashToSequenceRawHashIndex; for (uint32_t PathIndex = 0; PathIndex < BaseContent.Paths.size(); PathIndex++) @@ -703,20 +709,33 @@ DeletePathsFromChunkedContent(const 
ChunkedFolderContent& BaseContent, std::span { RawHashToSequenceRawHashIndex.insert( {RawHash, gsl::narrow(Result.ChunkedContent.SequenceRawHashes.size())}); - const uint32_t SequenceRawHashIndex = BaseLookup.RawHashToSequenceIndex.at(RawHash); - const uint32_t OrderIndexOffset = BaseLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; - const uint32_t ChunkCount = BaseContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; - ChunkingStatistics Stats; + const uint32_t SequenceRawHashIndex = BaseContentLookup.RawHashToSequenceIndex.at(RawHash); + const uint32_t OrderIndexOffset = BaseContentLookup.SequenceIndexChunkOrderOffset[SequenceRawHashIndex]; + const uint32_t ChunkCount = BaseContent.ChunkedContent.ChunkCounts[SequenceRawHashIndex]; + std::span OriginalChunkOrder = std::span(BaseContent.ChunkedContent.ChunkOrders).subspan(OrderIndexOffset, ChunkCount); - AddChunkSequence(Stats, - Result.ChunkedContent, - ChunkHashToChunkIndex, - RawHash, - OriginalChunkOrder, - BaseContent.ChunkedContent.ChunkHashes, - BaseContent.ChunkedContent.ChunkRawSizes); - Stats.UniqueSequencesFound++; + + Result.ChunkedContent.ChunkCounts.push_back(gsl::narrow(OriginalChunkOrder.size())); + + for (uint32_t OldChunkIndex : OriginalChunkOrder) + { + if (uint32_t FoundChunkIndex = NewChunkIndexes[OldChunkIndex]; FoundChunkIndex != (uint32_t)-1) + { + Result.ChunkedContent.ChunkOrders.push_back(FoundChunkIndex); + } + else + { + const uint32_t NewChunkIndex = gsl::narrow(Result.ChunkedContent.ChunkHashes.size()); + NewChunkIndexes[OldChunkIndex] = NewChunkIndex; + const IoHash& ChunkHash = BaseContent.ChunkedContent.ChunkHashes[OldChunkIndex]; + const uint64_t OldChunkSize = BaseContent.ChunkedContent.ChunkRawSizes[OldChunkIndex]; + Result.ChunkedContent.ChunkHashes.push_back(ChunkHash); + Result.ChunkedContent.ChunkRawSizes.push_back(OldChunkSize); + Result.ChunkedContent.ChunkOrders.push_back(NewChunkIndex); + } + } + Result.ChunkedContent.SequenceRawHashes.push_back(RawHash); 
} } } @@ -725,6 +744,19 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span return Result; } +ChunkedFolderContent +DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span DeletedPaths) +{ + ZEN_TRACE_CPU("DeletePathsFromChunkedContent"); + ZEN_ASSERT(DeletedPaths.size() <= BaseContent.Paths.size()); + if (DeletedPaths.size() == BaseContent.Paths.size()) + { + return {}; + } + const ChunkedContentLookup BaseLookup = BuildChunkedContentLookup(BaseContent); + return DeletePathsFromChunkedContent(BaseContent, BaseLookup, DeletedPaths); +} + ChunkedFolderContent ChunkFolderContent(ChunkingStatistics& Stats, WorkerThreadPool& WorkerPool, @@ -815,8 +847,9 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) struct ChunkLocationReference { - uint32_t ChunkIndex = (uint32_t)-1; - ChunkedContentLookup::ChunkSequenceLocation Location; + uint32_t ChunkIndex = (uint32_t)-1; + uint32_t SequenceIndex = (uint32_t)-1; + uint64_t Offset = (uint64_t)-1; }; ChunkedContentLookup Result; @@ -845,8 +878,7 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) { uint32_t ChunkIndex = Content.ChunkedContent.ChunkOrders[OrderIndex]; - Locations.push_back( - ChunkLocationReference{ChunkIndex, ChunkedContentLookup::ChunkSequenceLocation{SequenceIndex, LocationOffset}}); + Locations.push_back(ChunkLocationReference{.ChunkIndex = ChunkIndex, .SequenceIndex = SequenceIndex, .Offset = LocationOffset}); LocationOffset += Content.ChunkedContent.ChunkRawSizes[ChunkIndex]; } @@ -861,15 +893,15 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) { return false; } - if (Lhs.Location.SequenceIndex < Rhs.Location.SequenceIndex) + if (Lhs.SequenceIndex < Rhs.SequenceIndex) { return true; } - if (Lhs.Location.SequenceIndex > Rhs.Location.SequenceIndex) + if (Lhs.SequenceIndex > Rhs.SequenceIndex) { return false; } - return Lhs.Location.Offset < Rhs.Location.Offset; + return Lhs.Offset < Rhs.Offset; }); 
Result.ChunkSequenceLocations.reserve(Locations.size()); @@ -882,7 +914,10 @@ BuildChunkedContentLookup(const ChunkedFolderContent& Content) uint32_t Count = 0; while ((RangeOffset + Count < Locations.size()) && (Locations[RangeOffset + Count].ChunkIndex == ChunkIndex)) { - Result.ChunkSequenceLocations.push_back(Locations[RangeOffset + Count].Location); + const ChunkLocationReference& LocationReference = Locations[RangeOffset + Count]; + Result.ChunkSequenceLocations.push_back( + ChunkedContentLookup::ChunkSequenceLocation{.SequenceIndex = LocationReference.SequenceIndex, + .Offset = LocationReference.Offset}); Count++; } Result.ChunkSequenceLocationOffset.push_back(RangeOffset); -- cgit v1.2.3 From 40b9386054de3c23f77da74eefaa743240d164fd Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Thu, 5 Jun 2025 14:40:02 +0200 Subject: pause, resume and abort running builds cmd (#421) - Feature: `zen builds pause`, `zen builds resume` and `zen builds abort` commands to control a running `zen builds` command - `--process-id` the process id to control, if omitted it tries to find a running process using the same executable as itself - Improvement: Process report now indicates if it is pausing or aborting --- src/zenutil/chunkedcontent.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'src/zenutil/chunkedcontent.cpp') diff --git a/src/zenutil/chunkedcontent.cpp b/src/zenutil/chunkedcontent.cpp index c7532e098..cd1bf7dd7 100644 --- a/src/zenutil/chunkedcontent.cpp +++ b/src/zenutil/chunkedcontent.cpp @@ -758,14 +758,15 @@ DeletePathsFromChunkedContent(const ChunkedFolderContent& BaseContent, std::span } ChunkedFolderContent -ChunkFolderContent(ChunkingStatistics& Stats, - WorkerThreadPool& WorkerPool, - const std::filesystem::path& RootPath, - const FolderContent& Content, - const ChunkingController& InChunkingController, - int32_t UpdateIntervalMS, - std::function&& UpdateCallback, - std::atomic& AbortFlag) 
+ChunkFolderContent(ChunkingStatistics& Stats, + WorkerThreadPool& WorkerPool, + const std::filesystem::path& RootPath, + const FolderContent& Content, + const ChunkingController& InChunkingController, + int32_t UpdateIntervalMS, + std::function&& UpdateCallback, + std::atomic& AbortFlag, + std::atomic& PauseFlag) { ZEN_TRACE_CPU("ChunkFolderContent"); @@ -804,7 +805,7 @@ ChunkFolderContent(ChunkingStatistics& Stats, RwLock Lock; - ParallelWork Work(AbortFlag); + ParallelWork Work(AbortFlag, PauseFlag); for (uint32_t PathIndex : Order) { @@ -831,10 +832,9 @@ ChunkFolderContent(ChunkingStatistics& Stats, }); } - Work.Wait(UpdateIntervalMS, [&](bool IsAborted, std::ptrdiff_t PendingWork) { - ZEN_UNUSED(IsAborted); + Work.Wait(UpdateIntervalMS, [&](bool IsAborted, bool IsPaused, std::ptrdiff_t PendingWork) { ZEN_UNUSED(PendingWork); - UpdateCallback(Work.IsAborted(), Work.PendingWork().Remaining()); + UpdateCallback(IsAborted, IsPaused, Work.PendingWork().Remaining()); }); } return Result; -- cgit v1.2.3