// Copyright Epic Games, Inc. All Rights Reserved. #include "dedup_cmd.h" #include #include #include #include #include #include #include #include #if ZEN_PLATFORM_WINDOWS # include #endif #include #include namespace zen { //////////////////////////////////////////////////////////////////////////////// #if ZEN_PLATFORM_LINUX || ZEN_PLATFORM_MAC namespace Concurrency { template inline void parallel_invoke(T0 const& t0, T1 const& t1) { t0(); t1(); } } // namespace Concurrency #endif // ZEN_PLATFORM_LINUX/MAC //////////////////////////////////////////////////////////////////////////////// DedupCommand::DedupCommand() { m_Options.add_options()("h,help", "Print help"); m_Options.add_options()("size", "Configure size threshold for dedup", cxxopts::value(m_SizeThreshold)->default_value("131072")); m_Options.add_option("", "s", "source", "Copy source", cxxopts::value(m_DedupSource), ""); m_Options.add_option("", "t", "target", "Copy target", cxxopts::value(m_DedupTarget), ""); m_Options.add_option("", "", "positional", "Positional arguments", cxxopts::value(m_Positional), ""); m_Options.parse_positional({"source", "target", "positional"}); } DedupCommand::~DedupCommand() = default; void DedupCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) { ZEN_UNUSED(GlobalOptions); if (!ParseOptions(argc, argv)) { return; } // Validate arguments const bool SourceGood = zen::SupportsBlockRefCounting(m_DedupSource); const bool TargetGood = zen::SupportsBlockRefCounting(m_DedupTarget); if (!SourceGood) { throw std::runtime_error(fmt::format("Source directory '{}' does not support deduplication", m_DedupSource)); } if (!TargetGood) { throw std::runtime_error(fmt::format("Target directory '{}' does not support deduplication", m_DedupTarget)); } ZEN_CONSOLE("Performing dedup operation between {} and {}, size threshold {}", m_DedupSource, m_DedupTarget, zen::NiceBytes(m_SizeThreshold)); using DirEntryList_t = std::list; zen::RwLock MapLock; std::unordered_map FileSizeMap; size_t CandidateCount = 0; auto AddToList = [&](const std::filesystem::directory_entry& Entry) { if (Entry.is_regular_file()) { uintmax_t FileSize = Entry.file_size(); if (FileSize > m_SizeThreshold) { zen::RwLock::ExclusiveLockScope _(MapLock); FileSizeMap[FileSize].push_back(Entry); ++CandidateCount; } } }; std::filesystem::recursive_directory_iterator DirEnd; ZEN_CONSOLE("Gathering file info from source: '{}'", m_DedupSource); ZEN_CONSOLE("Gathering file info from target: '{}'", m_DedupTarget); { zen::Stopwatch Timer; Concurrency::parallel_invoke( [&] { for (std::filesystem::recursive_directory_iterator DirIt1(m_DedupSource); DirIt1 != DirEnd; ++DirIt1) { AddToList(*DirIt1); } }, [&] { for (std::filesystem::recursive_directory_iterator DirIt2(m_DedupTarget); DirIt2 != DirEnd; ++DirIt2) { AddToList(*DirIt2); } }); ZEN_CONSOLE("Gathered {} candidates across {} size buckets. Elapsed: {}", CandidateCount, FileSizeMap.size(), zen::NiceTimeSpanMs(Timer.GetElapsedTimeMs())); } ZEN_CONSOLE("Sorting buckets by size"); zen::Stopwatch Timer; uint64_t DupeBytes = 0; struct SizeList { size_t Size; DirEntryList_t* DirEntries; }; std::vector SizeLists{FileSizeMap.size()}; { int i = 0; for (auto& kv : FileSizeMap) { ZEN_ASSERT(kv.first >= m_SizeThreshold); SizeLists[i].Size = kv.first; SizeLists[i].DirEntries = &kv.second; ++i; } } std::sort(begin(SizeLists), end(SizeLists), [](const SizeList& Lhs, const SizeList& Rhs) { return Lhs.Size > Rhs.Size; }); ZEN_CONSOLE("Bucket summary:"); std::vector BucketId; std::vector BucketOffsets; std::vector BucketSizes; std::vector BucketFileCounts; size_t TotalFileSizes = 0; size_t TotalFileCount = 0; { size_t CurrentPow2 = 0; size_t BucketSize = 0; size_t BucketFileCount = 0; bool FirstBucket = true; for (size_t i = 0; i < SizeLists.size(); ++i) { const size_t ThisSize = SizeLists[i].Size; const size_t Pow2 = zen::NextPow2(ThisSize); if (CurrentPow2 != Pow2) { CurrentPow2 = Pow2; if (!FirstBucket) { BucketSizes.push_back(BucketSize); BucketFileCounts.push_back(BucketFileCount); } BucketId.push_back(Pow2); BucketOffsets.push_back(i); FirstBucket = false; BucketSize = 0; BucketFileCount = 0; } BucketSize += ThisSize; TotalFileSizes += ThisSize; BucketFileCount += SizeLists[i].DirEntries->size(); TotalFileCount += SizeLists[i].DirEntries->size(); } if (!FirstBucket) { BucketSizes.push_back(BucketSize); BucketFileCounts.push_back(BucketFileCount); } ZEN_ASSERT(BucketOffsets.size() == BucketSizes.size()); ZEN_ASSERT(BucketOffsets.size() == BucketFileCounts.size()); } for (size_t i = 0; i < BucketOffsets.size(); ++i) { ZEN_CONSOLE(" Bucket {} : {}, {} candidates", zen::NiceBytes(BucketId[i]), zen::NiceBytes(BucketSizes[i]), BucketFileCounts[i]); } ZEN_CONSOLE("Total : {}, {} candidates", zen::NiceBytes(TotalFileSizes), TotalFileCount); std::string CurrentNice; for (SizeList& Size : SizeLists) { std::string CurNice{zen::NiceBytes(zen::NextPow2(Size.Size))}; if (CurNice != CurrentNice) { CurrentNice = CurNice; ZEN_CONSOLE("Now scanning bucket: {}", CurrentNice); } std::unordered_map DedupMap; for (const auto& Entry : *Size.DirEntries) { zen::BLAKE3 Hash; if constexpr (true) { zen::BLAKE3Stream b3s; if (std::error_code ScanEc = zen::ScanFile(Entry.path(), 64 * 1024, [&](const void* Data, size_t Size) { b3s.Append(Data, Size); }); ScanEc) { throw std::system_error(ScanEc, fmt::format("Failed to scan file '{}'", Entry.path())); } Hash = b3s.GetHash(); } else { zen::FileContents Contents = zen::ReadFile(Entry.path()); zen::BLAKE3Stream b3s; for (zen::IoBuffer& Buffer : Contents.Data) { b3s.Append(Buffer.Data(), Buffer.Size()); } Hash = b3s.GetHash(); } if (const std::filesystem::directory_entry* Dupe = DedupMap[Hash]) { std::string FileA = PathToUtf8(Dupe->path()); std::string FileB = PathToUtf8(Entry.path()); size_t MinLen = std::min(FileA.size(), FileB.size()); auto Its = std::mismatch(FileB.rbegin(), FileB.rbegin() + MinLen, FileA.rbegin()); if (Its.first != FileB.rbegin()) { if (Its.first[-1] == '\\' || Its.first[-1] == '/') --Its.first; FileB = std::string(FileB.begin(), Its.first.base()) + "..."; } ZEN_INFO("{} {} <-> {}", zen::NiceBytes(Entry.file_size()).c_str(), FileA.c_str(), FileB.c_str()); zen::CopyFileOptions Options; Options.EnableClone = true; Options.MustClone = true; if (std::error_code Ec = zen::CopyFile(Dupe->path(), Entry.path(), Options); Ec) { ZEN_ERROR("Failed to clone '{}' to '{}': {}", Dupe->path(), Entry.path(), Ec.message()); continue; } DupeBytes += Entry.file_size(); } else { DedupMap[Hash] = &Entry; } } Size.DirEntries->clear(); } ZEN_CONSOLE("Elapsed: {} Deduped: {}", zen::NiceTimeSpanMs(Timer.GetElapsedTimeMs()), zen::NiceBytes(DupeBytes)); } } // namespace zen