diff options
| author | Dan Engelbrecht <[email protected]> | 2023-09-22 08:22:06 -0400 |
|---|---|---|
| committer | GitHub <[email protected]> | 2023-09-22 14:22:06 +0200 |
| commit | c7d4dc6a4d13881028d566f5ce501335e47e48bf (patch) | |
| tree | 493110da583a8e5d97fe05e14f23469ee6244d2b /src/zen/cmds/dedup_cmd.cpp | |
| parent | add trace command to enable/disable tracing at runtime (#416) (diff) | |
| download | archived-zen-c7d4dc6a4d13881028d566f5ce501335e47e48bf.tar.xz archived-zen-c7d4dc6a4d13881028d566f5ce501335e47e48bf.zip | |
Collect all zen admin-related commands into admin.h/.cpp (#418)
* move commands in scrub.h/cpp to admin_cmd.h/cpp
* move job command into admin_cmd.h/.cpp
* admin -> admin_cmd
* bench -> bench_cmd
* cache -> cache_cmd
* copy -> copy_cmd
* dedup -> dedup_cmd
* hash -> hash_cmd
* print -> print_cmd
* projectstore -> projectstore_cmd
* rpcreplay -> rpcreplay_cmd
* serve -> serve_cmd
* status -> status_cmd
* top -> top_cmd
* trace -> trace_cmd
* up -> up_cmd
* version -> version_cmd
Diffstat (limited to 'src/zen/cmds/dedup_cmd.cpp')
| -rw-r--r-- | src/zen/cmds/dedup_cmd.cpp | 302 |
1 files changed, 302 insertions, 0 deletions
diff --git a/src/zen/cmds/dedup_cmd.cpp b/src/zen/cmds/dedup_cmd.cpp new file mode 100644 index 000000000..d496cf404 --- /dev/null +++ b/src/zen/cmds/dedup_cmd.cpp @@ -0,0 +1,302 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "dedup_cmd.h" + +#include <zencore/blake3.h> +#include <zencore/filesystem.h> +#include <zencore/iobuffer.h> +#include <zencore/logging.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/timer.h> + +#if ZEN_PLATFORM_WINDOWS +# include <ppl.h> +#endif + +#include <list> + +namespace zen { + +//////////////////////////////////////////////////////////////////////////////// + +#if ZEN_PLATFORM_LINUX || ZEN_PLATFORM_MAC + +namespace Concurrency { + + template<typename T0, typename T1> + inline void parallel_invoke(T0 const& t0, T1 const& t1) + { + t0(); + t1(); + } + +} // namespace Concurrency + +#endif // ZEN_PLATFORM_LINUX/MAC + +//////////////////////////////////////////////////////////////////////////////// + +DedupCommand::DedupCommand() +{ + m_Options.add_options()("h,help", "Print help"); + m_Options.add_options()("size", "Configure size threshold for dedup", cxxopts::value(m_SizeThreshold)->default_value("131072")); + m_Options.add_option("", "s", "source", "Copy source", cxxopts::value(m_DedupSource), "<file/directory>"); + m_Options.add_option("", "t", "target", "Copy target", cxxopts::value(m_DedupTarget), "<file/directory>"); + m_Options.add_option("", "", "positional", "Positional arguments", cxxopts::value(m_Positional), ""); + m_Options.parse_positional({"source", "target", "positional"}); +} + +DedupCommand::~DedupCommand() = default; + +int +DedupCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) +{ + ZEN_UNUSED(GlobalOptions); + + if (!ParseOptions(argc, argv)) + { + return 0; + } + + // Validate arguments + + const bool SourceGood = zen::SupportsBlockRefCounting(m_DedupSource); + const bool TargetGood = zen::SupportsBlockRefCounting(m_DedupTarget); + + if (!SourceGood) + { + ZEN_ERROR("Source directory '{}' does not support deduplication", m_DedupSource); + + return 0; + } + + if (!TargetGood) + { + ZEN_ERROR("Target directory '{}' does not support deduplication", m_DedupTarget); + + return 0; + } + + ZEN_CONSOLE("Performing dedup operation between {} and {}, size threshold {}", + m_DedupSource, + m_DedupTarget, + zen::NiceBytes(m_SizeThreshold)); + + using DirEntryList_t = std::list<std::filesystem::directory_entry>; + + zen::RwLock MapLock; + std::unordered_map<size_t, DirEntryList_t> FileSizeMap; + size_t CandidateCount = 0; + + auto AddToList = [&](const std::filesystem::directory_entry& Entry) { + if (Entry.is_regular_file()) + { + uintmax_t FileSize = Entry.file_size(); + if (FileSize > m_SizeThreshold) + { + zen::RwLock::ExclusiveLockScope _(MapLock); + FileSizeMap[FileSize].push_back(Entry); + ++CandidateCount; + } + } + }; + + std::filesystem::recursive_directory_iterator DirEnd; + + ZEN_CONSOLE("Gathering file info from source: '{}'", m_DedupSource); + ZEN_CONSOLE("Gathering file info from target: '{}'", m_DedupTarget); + + { + zen::Stopwatch Timer; + + Concurrency::parallel_invoke( + [&] { + for (std::filesystem::recursive_directory_iterator DirIt1(m_DedupSource); DirIt1 != DirEnd; ++DirIt1) + { + AddToList(*DirIt1); + } + }, + [&] { + for (std::filesystem::recursive_directory_iterator DirIt2(m_DedupTarget); DirIt2 != DirEnd; ++DirIt2) + { + AddToList(*DirIt2); + } + }); + + ZEN_CONSOLE("Gathered {} candidates across {} size buckets. Elapsed: {}", + CandidateCount, + FileSizeMap.size(), + zen::NiceTimeSpanMs(Timer.GetElapsedTimeMs())); + } + + ZEN_CONSOLE("Sorting buckets by size"); + + zen::Stopwatch Timer; + + uint64_t DupeBytes = 0; + + struct SizeList + { + size_t Size; + DirEntryList_t* DirEntries; + }; + + std::vector<SizeList> SizeLists{FileSizeMap.size()}; + + { + int i = 0; + + for (auto& kv : FileSizeMap) + { + ZEN_ASSERT(kv.first >= m_SizeThreshold); + SizeLists[i].Size = kv.first; + SizeLists[i].DirEntries = &kv.second; + ++i; + } + } + + std::sort(begin(SizeLists), end(SizeLists), [](const SizeList& Lhs, const SizeList& Rhs) { return Lhs.Size > Rhs.Size; }); + + ZEN_CONSOLE("Bucket summary:"); + + std::vector<size_t> BucketId; + std::vector<size_t> BucketOffsets; + std::vector<size_t> BucketSizes; + std::vector<size_t> BucketFileCounts; + + size_t TotalFileSizes = 0; + size_t TotalFileCount = 0; + + { + size_t CurrentPow2 = 0; + size_t BucketSize = 0; + size_t BucketFileCount = 0; + bool FirstBucket = true; + + for (size_t i = 0; i < SizeLists.size(); ++i) + { + const size_t ThisSize = SizeLists[i].Size; + const size_t Pow2 = zen::NextPow2(ThisSize); + + if (CurrentPow2 != Pow2) + { + CurrentPow2 = Pow2; + + if (!FirstBucket) + { + BucketSizes.push_back(BucketSize); + BucketFileCounts.push_back(BucketFileCount); + } + + BucketId.push_back(Pow2); + BucketOffsets.push_back(i); + + FirstBucket = false; + BucketSize = 0; + BucketFileCount = 0; + } + + BucketSize += ThisSize; + TotalFileSizes += ThisSize; + BucketFileCount += SizeLists[i].DirEntries->size(); + TotalFileCount += SizeLists[i].DirEntries->size(); + } + + if (!FirstBucket) + { + BucketSizes.push_back(BucketSize); + BucketFileCounts.push_back(BucketFileCount); + } + + ZEN_ASSERT(BucketOffsets.size() == BucketSizes.size()); + ZEN_ASSERT(BucketOffsets.size() == BucketFileCounts.size()); + } + + for (size_t i = 0; i < BucketOffsets.size(); ++i) + { + ZEN_CONSOLE(" Bucket {} : {}, {} candidates", zen::NiceBytes(BucketId[i]), zen::NiceBytes(BucketSizes[i]), BucketFileCounts[i]); + } + + ZEN_CONSOLE("Total : {}, {} candidates", zen::NiceBytes(TotalFileSizes), TotalFileCount); + + std::string CurrentNice; + + for (SizeList& Size : SizeLists) + { + std::string CurNice{zen::NiceBytes(zen::NextPow2(Size.Size))}; + + if (CurNice != CurrentNice) + { + CurrentNice = CurNice; + ZEN_CONSOLE("Now scanning bucket: {}", CurrentNice); + } + + std::unordered_map<zen::BLAKE3, const std::filesystem::directory_entry*, zen::BLAKE3::Hasher> DedupMap; + + for (const auto& Entry : *Size.DirEntries) + { + zen::BLAKE3 Hash; + + if constexpr (true) + { + zen::BLAKE3Stream b3s; + + zen::ScanFile(Entry.path(), 64 * 1024, [&](const void* Data, size_t Size) { b3s.Append(Data, Size); }); + + Hash = b3s.GetHash(); + } + else + { + zen::FileContents Contents = zen::ReadFile(Entry.path()); + + zen::BLAKE3Stream b3s; + + for (zen::IoBuffer& Buffer : Contents.Data) + { + b3s.Append(Buffer.Data(), Buffer.Size()); + } + Hash = b3s.GetHash(); + } + + if (const std::filesystem::directory_entry* Dupe = DedupMap[Hash]) + { + std::string FileA = PathToUtf8(Dupe->path()); + std::string FileB = PathToUtf8(Entry.path()); + + size_t MinLen = std::min(FileA.size(), FileB.size()); + auto Its = std::mismatch(FileB.rbegin(), FileB.rbegin() + MinLen, FileA.rbegin()); + + if (Its.first != FileB.rbegin()) + { + if (Its.first[-1] == '\\' || Its.first[-1] == '/') + --Its.first; + + FileB = std::string(FileB.begin(), Its.first.base()) + "..."; + } + + ZEN_INFO("{} {} <-> {}", zen::NiceBytes(Entry.file_size()).c_str(), FileA.c_str(), FileB.c_str()); + + zen::CopyFileOptions Options; + Options.EnableClone = true; + Options.MustClone = true; + + zen::CopyFile(Dupe->path(), Entry.path(), Options); + + DupeBytes += Entry.file_size(); + } + else + { + DedupMap[Hash] = &Entry; + } + } + + Size.DirEntries->clear(); + } + + ZEN_CONSOLE("Elapsed: {} Deduped: {}", zen::NiceTimeSpanMs(Timer.GetElapsedTimeMs()), zen::NiceBytes(DupeBytes)); + + return 0; +} + +} // namespace zen |