aboutsummaryrefslogtreecommitdiff
path: root/src/zen/cmds/dedup_cmd.cpp
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2023-09-22 08:22:06 -0400
committerGitHub <[email protected]>2023-09-22 14:22:06 +0200
commitc7d4dc6a4d13881028d566f5ce501335e47e48bf (patch)
tree493110da583a8e5d97fe05e14f23469ee6244d2b /src/zen/cmds/dedup_cmd.cpp
parentadd trace command to enable/disable tracing at runtime (#416) (diff)
downloadarchived-zen-c7d4dc6a4d13881028d566f5ce501335e47e48bf.tar.xz
archived-zen-c7d4dc6a4d13881028d566f5ce501335e47e48bf.zip
Collect all zen admin-related commands into admin.h/.cpp (#418)
* move commands in scrub.h/cpp to admin_cmd.h/cpp * move job command into admin_cmd.h/.cpp * admin -> admin_cmd * bench -> bench_cmd * cache -> cache_cmd * copy -> copy_cmd * dedup -> dedup_cmd * hash -> hash_cmd * print -> print_cmd * projectstore -> projectstore_cmd * rpcreplay -> rpcreplay_cmd * serve -> serve_cmd * status -> status_cmd * top -> top_cmd * trace -> trace_cmd * up -> up_cmd * version -> version_cmd
Diffstat (limited to 'src/zen/cmds/dedup_cmd.cpp')
-rw-r--r--src/zen/cmds/dedup_cmd.cpp302
1 files changed, 302 insertions, 0 deletions
diff --git a/src/zen/cmds/dedup_cmd.cpp b/src/zen/cmds/dedup_cmd.cpp
new file mode 100644
index 000000000..d496cf404
--- /dev/null
+++ b/src/zen/cmds/dedup_cmd.cpp
@@ -0,0 +1,302 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#include "dedup_cmd.h"
+
+#include <zencore/blake3.h>
+#include <zencore/filesystem.h>
+#include <zencore/iobuffer.h>
+#include <zencore/logging.h>
+#include <zencore/string.h>
+#include <zencore/thread.h>
+#include <zencore/timer.h>
+
+#if ZEN_PLATFORM_WINDOWS
+# include <ppl.h>
+#endif
+
+#include <list>
+
+namespace zen {
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if ZEN_PLATFORM_LINUX || ZEN_PLATFORM_MAC
+
+namespace Concurrency {
+
+ template<typename T0, typename T1>
+ inline void parallel_invoke(T0 const& t0, T1 const& t1)
+ {
+ t0();
+ t1();
+ }
+
+} // namespace Concurrency
+
+#endif // ZEN_PLATFORM_LINUX/MAC
+
+////////////////////////////////////////////////////////////////////////////////
+
+DedupCommand::DedupCommand()
+{
+ m_Options.add_options()("h,help", "Print help");
+ m_Options.add_options()("size", "Configure size threshold for dedup", cxxopts::value(m_SizeThreshold)->default_value("131072"));
+ m_Options.add_option("", "s", "source", "Copy source", cxxopts::value(m_DedupSource), "<file/directory>");
+ m_Options.add_option("", "t", "target", "Copy target", cxxopts::value(m_DedupTarget), "<file/directory>");
+ m_Options.add_option("", "", "positional", "Positional arguments", cxxopts::value(m_Positional), "");
+ m_Options.parse_positional({"source", "target", "positional"});
+}
+
+DedupCommand::~DedupCommand() = default;
+
+int
+DedupCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv)
+{
+ ZEN_UNUSED(GlobalOptions);
+
+ if (!ParseOptions(argc, argv))
+ {
+ return 0;
+ }
+
+ // Validate arguments
+
+ const bool SourceGood = zen::SupportsBlockRefCounting(m_DedupSource);
+ const bool TargetGood = zen::SupportsBlockRefCounting(m_DedupTarget);
+
+ if (!SourceGood)
+ {
+ ZEN_ERROR("Source directory '{}' does not support deduplication", m_DedupSource);
+
+ return 0;
+ }
+
+ if (!TargetGood)
+ {
+ ZEN_ERROR("Target directory '{}' does not support deduplication", m_DedupTarget);
+
+ return 0;
+ }
+
+ ZEN_CONSOLE("Performing dedup operation between {} and {}, size threshold {}",
+ m_DedupSource,
+ m_DedupTarget,
+ zen::NiceBytes(m_SizeThreshold));
+
+ using DirEntryList_t = std::list<std::filesystem::directory_entry>;
+
+ zen::RwLock MapLock;
+ std::unordered_map<size_t, DirEntryList_t> FileSizeMap;
+ size_t CandidateCount = 0;
+
+ auto AddToList = [&](const std::filesystem::directory_entry& Entry) {
+ if (Entry.is_regular_file())
+ {
+ uintmax_t FileSize = Entry.file_size();
+ if (FileSize > m_SizeThreshold)
+ {
+ zen::RwLock::ExclusiveLockScope _(MapLock);
+ FileSizeMap[FileSize].push_back(Entry);
+ ++CandidateCount;
+ }
+ }
+ };
+
+ std::filesystem::recursive_directory_iterator DirEnd;
+
+ ZEN_CONSOLE("Gathering file info from source: '{}'", m_DedupSource);
+ ZEN_CONSOLE("Gathering file info from target: '{}'", m_DedupTarget);
+
+ {
+ zen::Stopwatch Timer;
+
+ Concurrency::parallel_invoke(
+ [&] {
+ for (std::filesystem::recursive_directory_iterator DirIt1(m_DedupSource); DirIt1 != DirEnd; ++DirIt1)
+ {
+ AddToList(*DirIt1);
+ }
+ },
+ [&] {
+ for (std::filesystem::recursive_directory_iterator DirIt2(m_DedupTarget); DirIt2 != DirEnd; ++DirIt2)
+ {
+ AddToList(*DirIt2);
+ }
+ });
+
+ ZEN_CONSOLE("Gathered {} candidates across {} size buckets. Elapsed: {}",
+ CandidateCount,
+ FileSizeMap.size(),
+ zen::NiceTimeSpanMs(Timer.GetElapsedTimeMs()));
+ }
+
+ ZEN_CONSOLE("Sorting buckets by size");
+
+ zen::Stopwatch Timer;
+
+ uint64_t DupeBytes = 0;
+
+ struct SizeList
+ {
+ size_t Size;
+ DirEntryList_t* DirEntries;
+ };
+
+ std::vector<SizeList> SizeLists{FileSizeMap.size()};
+
+ {
+ int i = 0;
+
+ for (auto& kv : FileSizeMap)
+ {
+ ZEN_ASSERT(kv.first >= m_SizeThreshold);
+ SizeLists[i].Size = kv.first;
+ SizeLists[i].DirEntries = &kv.second;
+ ++i;
+ }
+ }
+
+ std::sort(begin(SizeLists), end(SizeLists), [](const SizeList& Lhs, const SizeList& Rhs) { return Lhs.Size > Rhs.Size; });
+
+ ZEN_CONSOLE("Bucket summary:");
+
+ std::vector<size_t> BucketId;
+ std::vector<size_t> BucketOffsets;
+ std::vector<size_t> BucketSizes;
+ std::vector<size_t> BucketFileCounts;
+
+ size_t TotalFileSizes = 0;
+ size_t TotalFileCount = 0;
+
+ {
+ size_t CurrentPow2 = 0;
+ size_t BucketSize = 0;
+ size_t BucketFileCount = 0;
+ bool FirstBucket = true;
+
+ for (size_t i = 0; i < SizeLists.size(); ++i)
+ {
+ const size_t ThisSize = SizeLists[i].Size;
+ const size_t Pow2 = zen::NextPow2(ThisSize);
+
+ if (CurrentPow2 != Pow2)
+ {
+ CurrentPow2 = Pow2;
+
+ if (!FirstBucket)
+ {
+ BucketSizes.push_back(BucketSize);
+ BucketFileCounts.push_back(BucketFileCount);
+ }
+
+ BucketId.push_back(Pow2);
+ BucketOffsets.push_back(i);
+
+ FirstBucket = false;
+ BucketSize = 0;
+ BucketFileCount = 0;
+ }
+
+ BucketSize += ThisSize;
+ TotalFileSizes += ThisSize;
+ BucketFileCount += SizeLists[i].DirEntries->size();
+ TotalFileCount += SizeLists[i].DirEntries->size();
+ }
+
+ if (!FirstBucket)
+ {
+ BucketSizes.push_back(BucketSize);
+ BucketFileCounts.push_back(BucketFileCount);
+ }
+
+ ZEN_ASSERT(BucketOffsets.size() == BucketSizes.size());
+ ZEN_ASSERT(BucketOffsets.size() == BucketFileCounts.size());
+ }
+
+ for (size_t i = 0; i < BucketOffsets.size(); ++i)
+ {
+ ZEN_CONSOLE(" Bucket {} : {}, {} candidates", zen::NiceBytes(BucketId[i]), zen::NiceBytes(BucketSizes[i]), BucketFileCounts[i]);
+ }
+
+ ZEN_CONSOLE("Total : {}, {} candidates", zen::NiceBytes(TotalFileSizes), TotalFileCount);
+
+ std::string CurrentNice;
+
+ for (SizeList& Size : SizeLists)
+ {
+ std::string CurNice{zen::NiceBytes(zen::NextPow2(Size.Size))};
+
+ if (CurNice != CurrentNice)
+ {
+ CurrentNice = CurNice;
+ ZEN_CONSOLE("Now scanning bucket: {}", CurrentNice);
+ }
+
+ std::unordered_map<zen::BLAKE3, const std::filesystem::directory_entry*, zen::BLAKE3::Hasher> DedupMap;
+
+ for (const auto& Entry : *Size.DirEntries)
+ {
+ zen::BLAKE3 Hash;
+
+ if constexpr (true)
+ {
+ zen::BLAKE3Stream b3s;
+
+ zen::ScanFile(Entry.path(), 64 * 1024, [&](const void* Data, size_t Size) { b3s.Append(Data, Size); });
+
+ Hash = b3s.GetHash();
+ }
+ else
+ {
+ zen::FileContents Contents = zen::ReadFile(Entry.path());
+
+ zen::BLAKE3Stream b3s;
+
+ for (zen::IoBuffer& Buffer : Contents.Data)
+ {
+ b3s.Append(Buffer.Data(), Buffer.Size());
+ }
+ Hash = b3s.GetHash();
+ }
+
+ if (const std::filesystem::directory_entry* Dupe = DedupMap[Hash])
+ {
+ std::string FileA = PathToUtf8(Dupe->path());
+ std::string FileB = PathToUtf8(Entry.path());
+
+ size_t MinLen = std::min(FileA.size(), FileB.size());
+ auto Its = std::mismatch(FileB.rbegin(), FileB.rbegin() + MinLen, FileA.rbegin());
+
+ if (Its.first != FileB.rbegin())
+ {
+ if (Its.first[-1] == '\\' || Its.first[-1] == '/')
+ --Its.first;
+
+ FileB = std::string(FileB.begin(), Its.first.base()) + "...";
+ }
+
+ ZEN_INFO("{} {} <-> {}", zen::NiceBytes(Entry.file_size()).c_str(), FileA.c_str(), FileB.c_str());
+
+ zen::CopyFileOptions Options;
+ Options.EnableClone = true;
+ Options.MustClone = true;
+
+ zen::CopyFile(Dupe->path(), Entry.path(), Options);
+
+ DupeBytes += Entry.file_size();
+ }
+ else
+ {
+ DedupMap[Hash] = &Entry;
+ }
+ }
+
+ Size.DirEntries->clear();
+ }
+
+ ZEN_CONSOLE("Elapsed: {} Deduped: {}", zen::NiceTimeSpanMs(Timer.GetElapsedTimeMs()), zen::NiceBytes(DupeBytes));
+
+ return 0;
+}
+
+} // namespace zen