diff options
| author | Dan Engelbrecht <[email protected]> | 2022-06-17 07:06:21 -0700 |
|---|---|---|
| committer | GitHub <[email protected]> | 2022-06-17 07:06:21 -0700 |
| commit | c7e22a4ef1cce7103b9afbeec487461cb32f8dbe (patch) | |
| tree | 8b99d51bf496c96f82161c18fbdcfd5c6f8f31fd /zenstore/include | |
| parent | fixed merge mistake which caused a build error (diff) | |
| download | zen-c7e22a4ef1cce7103b9afbeec487461cb32f8dbe.tar.xz zen-c7e22a4ef1cce7103b9afbeec487461cb32f8dbe.zip | |
Make cas storage a hidden implementation detail of CidStore (#130) v0.1.4-pre6 v0.1.4-pre5
- Bumped ZEN_SCHEMA_VERSION
- CasStore no longer a public API, it is hidden behind CidStore
- Moved cas.h from public header folder
- CidStore no longer maps from Cid -> Cas, we store entries in Cas under RawHash
- CasStore now decompresses data to validate content (matching against RawHash)
- CasChunkSet renamed to HashKeySet and moved to a separate header/cpp file
- Disabled "Chunk" command for now as it relied on CAS being exposed as a service
- Changed CAS http service to Cid http server
- Moved "Run" command completely inside ZEN_WITH_EXEC_SERVICES define
- Removed "cas.basic" test
- Uncommented ".exec.basic" test and added return-skip at start of test
- Moved ScrubContext to separate header file
- Renamed CasGC to GcManager
- Cleaned up configuration passing in cas store classes
- Removed CAS stuff from GcContext and clarified naming in class
- Removed migration code
Diffstat (limited to 'zenstore/include')
| -rw-r--r-- | zenstore/include/zenstore/cas.h | 144 | ||||
| -rw-r--r-- | zenstore/include/zenstore/caslog.h | 2 | ||||
| -rw-r--r-- | zenstore/include/zenstore/cidstore.h | 57 | ||||
| -rw-r--r-- | zenstore/include/zenstore/gc.h | 44 | ||||
| -rw-r--r-- | zenstore/include/zenstore/hashkeyset.h | 54 | ||||
| -rw-r--r-- | zenstore/include/zenstore/scrubcontext.h | 40 |
6 files changed, 152 insertions, 189 deletions
diff --git a/zenstore/include/zenstore/cas.h b/zenstore/include/zenstore/cas.h deleted file mode 100644 index 5592fbd0a..000000000 --- a/zenstore/include/zenstore/cas.h +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright Epic Games, Inc. All Rights Reserved. - -#pragma once - -#include "zenstore.h" - -#include <zencore/blake3.h> -#include <zencore/iobuffer.h> -#include <zencore/iohash.h> -#include <zencore/refcount.h> -#include <zencore/timer.h> - -#include <atomic> -#include <filesystem> -#include <functional> -#include <memory> -#include <string> -#include <unordered_set> - -namespace zen { - -class GcContext; -class CasGc; - -struct CasStoreConfiguration -{ - // Root directory for CAS store - std::filesystem::path RootDirectory; - - // Threshold below which values are considered 'tiny' and managed using the 'tiny values' strategy - uint64_t TinyValueThreshold = 1024; - - // Threshold above which values are considered 'huge' and managed using the 'huge values' strategy - uint64_t HugeValueThreshold = 1024 * 1024; -}; - -/** Manage a set of IoHash values - */ - -class CasChunkSet -{ -public: - void AddChunkToSet(const IoHash& HashToAdd); - void AddChunksToSet(std::span<const IoHash> HashesToAdd); - void RemoveChunksIf(std::function<bool(const IoHash& CandidateHash)>&& Predicate); - void IterateChunks(std::function<void(const IoHash& ChunkHash)>&& Callback); - [[nodiscard]] inline bool ContainsChunk(const IoHash& Hash) const { return m_ChunkSet.find(Hash) != m_ChunkSet.end(); } - [[nodiscard]] inline bool IsEmpty() const { return m_ChunkSet.empty(); } - [[nodiscard]] inline size_t GetSize() const { return m_ChunkSet.size(); } - - inline void FilterChunks(std::span<const IoHash> Candidates, Invocable<const IoHash&> auto MatchFunc) - { - for (const IoHash& Candidate : Candidates) - { - if (ContainsChunk(Candidate)) - { - MatchFunc(Candidate); - } - } - } - - inline void FilterChunks(std::span<const IoHash> Candidates, Invocable<const IoHash&, bool> auto MatchFunc) - { - 
for (const IoHash& Candidate : Candidates) - { - MatchFunc(Candidate, ContainsChunk(Candidate)); - } - } - -private: - // Q: should we protect this with a lock, or is that a higher level concern? - std::unordered_set<IoHash, IoHash::Hasher> m_ChunkSet; -}; - -/** Context object for data scrubbing - * - * Data scrubbing is when we traverse stored data to validate it and - * optionally correct/recover - */ - -class ScrubContext -{ -public: - virtual void ReportBadCasChunks(std::span<IoHash> BadCasChunks); - inline uint64_t ScrubTimestamp() const { return m_ScrubTime; } - inline bool RunRecovery() const { return m_Recover; } - void ReportScrubbed(uint64_t ChunkCount, uint64_t ChunkBytes); - - inline uint64_t ScrubbedChunks() const { return m_ChunkCount; } - inline uint64_t ScrubbedBytes() const { return m_ByteCount; } - -private: - uint64_t m_ScrubTime = GetHifreqTimerValue(); - bool m_Recover = true; - std::atomic<uint64_t> m_ChunkCount{0}; - std::atomic<uint64_t> m_ByteCount{0}; - CasChunkSet m_BadCas; - CasChunkSet m_BadCid; -}; - -struct CasStoreSize -{ - uint64_t TinySize{}; - uint64_t SmallSize{}; - uint64_t LargeSize{}; - uint64_t TotalSize{}; -}; - -/** Content Addressable Storage interface - - */ - -class CasStore -{ -public: - virtual ~CasStore() = default; - - const CasStoreConfiguration& Config() { return m_Config; } - - struct InsertResult - { - bool New = false; - }; - - virtual void Initialize(const CasStoreConfiguration& Config) = 0; - virtual InsertResult InsertChunk(IoBuffer Data, const IoHash& ChunkHash) = 0; - virtual IoBuffer FindChunk(const IoHash& ChunkHash) = 0; - virtual bool ContainsChunk(const IoHash& ChunkHash) = 0; - virtual void FilterChunks(CasChunkSet& InOutChunks) = 0; - virtual void Flush() = 0; - virtual void Scrub(ScrubContext& Ctx) = 0; - virtual void GarbageCollect(GcContext& GcCtx) = 0; - virtual CasStoreSize TotalSize() const = 0; - -protected: - CasStoreConfiguration m_Config; - uint64_t m_LastScrubTime = 0; -}; - -ZENCORE_API 
std::unique_ptr<CasStore> CreateCasStore(CasGc& Gc); - -void CAS_forcelink(); - -} // namespace zen diff --git a/zenstore/include/zenstore/caslog.h b/zenstore/include/zenstore/caslog.h index 4b93a708f..c56b653fc 100644 --- a/zenstore/include/zenstore/caslog.h +++ b/zenstore/include/zenstore/caslog.h @@ -2,8 +2,6 @@ #pragma once -#include "zenstore.h" - #include <zencore/uid.h> #include <zenstore/basicfile.h> diff --git a/zenstore/include/zenstore/cidstore.h b/zenstore/include/zenstore/cidstore.h index b0252a2a6..21e3c3160 100644 --- a/zenstore/include/zenstore/cidstore.h +++ b/zenstore/include/zenstore/cidstore.h @@ -5,7 +5,7 @@ #include "zenstore.h" #include <zencore/iohash.h> -#include <zenstore/cas.h> +#include <zenstore/hashkeyset.h> ZEN_THIRD_PARTY_INCLUDES_START #include <tsl/robin_map.h> @@ -15,53 +15,68 @@ ZEN_THIRD_PARTY_INCLUDES_END namespace zen { +class GcManager; class CasStore; class CompressedBuffer; class IoBuffer; +class ScrubContext; /** Content Store * - * Data in the content store is referenced by content identifiers (CIDs), rather than their - * literal hash. This class maps uncompressed hashes to compressed hashes and may + * Data in the content store is referenced by content identifiers (CIDs), it works + * with compressed buffers so the CID is expected to be the RAW hash. It stores the + * chunk directly under the RAW hash. + * This class maps uncompressed hashes (CIDs) to compressed hashes and may * be used to deal with other kinds of indirections in the future. For example, if we want * to support chunking then a CID may represent a list of chunks which could be concatenated * to form the referenced chunk. * - * It would likely be possible to implement this mapping in a more efficient way if we - * integrate it into the CAS store itself, so we can avoid maintaining copies of large - * hashes in multiple locations. 
This would also allow us to consolidate commit logs etc - * which would be more resilient than the current split log scheme - * */ + +struct CidStoreSize +{ + uint64_t TinySize = 0; + uint64_t SmallSize = 0; + uint64_t LargeSize = 0; + uint64_t TotalSize = 0; +}; + +struct CidStoreConfiguration +{ + // Root directory for CAS store + std::filesystem::path RootDirectory; + + // Threshold below which values are considered 'tiny' and managed using the 'tiny values' strategy + uint64_t TinyValueThreshold = 1024; + + // Threshold above which values are considered 'huge' and managed using the 'huge values' strategy + uint64_t HugeValueThreshold = 1024 * 1024; +}; + class CidStore { public: - CidStore(CasStore& InCasStore, const std::filesystem::path& RootDir); + CidStore(GcManager& Gc); ~CidStore(); struct InsertResult { - IoHash DecompressedId; - IoHash CompressedHash; - bool New = false; + bool New = false; }; - InsertResult AddChunk(CompressedBuffer& ChunkData); - void AddCompressedCid(const IoHash& DecompressedId, const IoHash& Compressed); + void Initialize(const CidStoreConfiguration& Config); + InsertResult AddChunk(const CompressedBuffer& ChunkData); IoBuffer FindChunkByCid(const IoHash& DecompressedId); bool ContainsChunk(const IoHash& DecompressedId); + void FilterChunks(HashKeySet& InOutChunks); void Flush(); void Scrub(ScrubContext& Ctx); - void RemoveCids(CasChunkSet& CasChunks); - CasStoreSize CasSize() const; - - // TODO: add batch filter support - - IoHash RemapCid(const IoHash& DecompressedId); + CidStoreSize TotalSize() const; private: struct Impl; - std::unique_ptr<Impl> m_Impl; + std::unique_ptr<CasStore> m_CasStore; + std::unique_ptr<Impl> m_Impl; }; } // namespace zen diff --git a/zenstore/include/zenstore/gc.h b/zenstore/include/zenstore/gc.h index 398025181..656e594af 100644 --- a/zenstore/include/zenstore/gc.h +++ b/zenstore/include/zenstore/gc.h @@ -22,8 +22,8 @@ class logger; namespace zen { -class CasChunkSet; -class CasGc; +class HashKeySet; 
+class GcManager; class CidStore; struct IoHash; @@ -50,18 +50,16 @@ public: GcContext(GcClock::TimePoint Time = GcClock::Now()); ~GcContext(); - void ContributeCids(std::span<const IoHash> Cid); - void ContributeCas(std::span<const IoHash> Hash); - void ContributeCacheKeys(const std::string& CacheKeyContext, std::vector<IoHash>&& ExpiredKeys); + void AddRetainedCids(std::span<const IoHash> Cid); + void SetExpiredCacheKeys(const std::string& CacheKeyContext, std::vector<IoHash>&& ExpiredKeys); void IterateCids(std::function<void(const IoHash&)> Callback); void FilterCids(std::span<const IoHash> Cid, std::function<void(const IoHash&)> KeepFunc); - void FilterCas(std::span<const IoHash> Cas, std::function<void(const IoHash&)> KeepFunc); - void FilterCas(std::span<const IoHash> Cas, std::function<void(const IoHash&, bool)>&& FilterFunc); + void FilterCids(std::span<const IoHash> Cid, std::function<void(const IoHash&, bool)>&& FilterFunc); - void DeletedCas(std::span<const IoHash> Cas); - CasChunkSet& DeletedCas(); + void AddDeletedCids(std::span<const IoHash> Cas); + const HashKeySet& DeletedCids(); std::span<const IoHash> ExpiredCacheKeys(const std::string& CacheKeyContext) const; @@ -97,13 +95,13 @@ private: class GcContributor { public: - GcContributor(CasGc& Gc); + GcContributor(GcManager& Gc); ~GcContributor(); virtual void GatherReferences(GcContext& GcCtx) = 0; protected: - CasGc& m_Gc; + GcManager& m_Gc; }; struct GcStorageSize @@ -117,23 +115,23 @@ struct GcStorageSize class GcStorage { public: - GcStorage(CasGc& Gc); + GcStorage(GcManager& Gc); ~GcStorage(); virtual void CollectGarbage(GcContext& GcCtx) = 0; virtual GcStorageSize StorageSize() const = 0; private: - CasGc& m_Gc; + GcManager& m_Gc; }; /** GC orchestrator */ -class CasGc +class GcManager { public: - CasGc(); - ~CasGc(); + GcManager(); + ~GcManager(); void AddGcContributor(GcContributor* Contributor); void RemoveGcContributor(GcContributor* Contributor); @@ -143,12 +141,14 @@ public: void 
CollectGarbage(GcContext& GcCtx); - void SetCidStore(CidStore* Cids); - void OnNewCidReferences(std::span<IoHash> Hashes); - void OnCommittedCidReferences(std::span<IoHash> Hashes); - void OnDroppedCidReferences(std::span<IoHash> Hashes); GcStorageSize TotalStorageSize() const; +#if ZEN_USE_REF_TRACKING + void OnNewCidReferences(std::span<IoHash> Hashes); + void OnCommittedCidReferences(std::span<IoHash> Hashes); + void OnDroppedCidReferences(std::span<IoHash> Hashes); +#endif + private: mutable RwLock m_Lock; std::vector<GcContributor*> m_GcContribs; @@ -180,7 +180,7 @@ struct GcSchedulerConfig class GcScheduler { public: - GcScheduler(CasGc& CasGc); + GcScheduler(GcManager& GcManager); ~GcScheduler(); void Initialize(const GcSchedulerConfig& Config); @@ -201,7 +201,7 @@ private: spdlog::logger& Log() { return m_Log; } spdlog::logger& m_Log; - CasGc& m_CasGc; + GcManager& m_GcManager; GcSchedulerConfig m_Config; GcClock::TimePoint m_LastGcTime{}; GcClock::TimePoint m_NextGcTime{}; diff --git a/zenstore/include/zenstore/hashkeyset.h b/zenstore/include/zenstore/hashkeyset.h new file mode 100644 index 000000000..411a6256e --- /dev/null +++ b/zenstore/include/zenstore/hashkeyset.h @@ -0,0 +1,54 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zenstore.h" + +#include <zencore/iohash.h> + +#include <functional> +#include <unordered_set> + +namespace zen { + +/** Manage a set of IoHash values + */ + +class HashKeySet +{ +public: + void AddHashToSet(const IoHash& HashToAdd); + void AddHashesToSet(std::span<const IoHash> HashesToAdd); + void RemoveHashesIf(std::function<bool(const IoHash& CandidateHash)>&& Predicate); + void IterateHashes(std::function<void(const IoHash& Hash)>&& Callback) const; + [[nodiscard]] inline bool ContainsHash(const IoHash& Hash) const { return m_HashSet.find(Hash) != m_HashSet.end(); } + [[nodiscard]] inline bool IsEmpty() const { return m_HashSet.empty(); } + [[nodiscard]] inline size_t GetSize() const { return m_HashSet.size(); } + + inline void FilterHashes(std::span<const IoHash> Candidates, Invocable<const IoHash&> auto MatchFunc) const + { + for (const IoHash& Candidate : Candidates) + { + if (ContainsHash(Candidate)) + { + MatchFunc(Candidate); + } + } + } + + inline void FilterHashes(std::span<const IoHash> Candidates, Invocable<const IoHash&, bool> auto MatchFunc) const + { + for (const IoHash& Candidate : Candidates) + { + MatchFunc(Candidate, ContainsHash(Candidate)); + } + } + +private: + // Q: should we protect this with a lock, or is that a higher level concern? + std::unordered_set<IoHash, IoHash::Hasher> m_HashSet; +}; + +void hashkeyset_forcelink(); + +} // namespace zen diff --git a/zenstore/include/zenstore/scrubcontext.h b/zenstore/include/zenstore/scrubcontext.h new file mode 100644 index 000000000..bf906492c --- /dev/null +++ b/zenstore/include/zenstore/scrubcontext.h @@ -0,0 +1,40 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <zencore/timer.h> + +namespace zen { + +/** Context object for data scrubbing + * + * Data scrubbing is when we traverse stored data to validate it and + * optionally correct/recover + */ + +class ScrubContext +{ +public: + virtual void ReportBadCidChunks(std::span<IoHash> BadCasChunks) { m_BadCid.AddHashesToSet(BadCasChunks); } + inline uint64_t ScrubTimestamp() const { return m_ScrubTime; } + inline bool RunRecovery() const { return m_Recover; } + void ReportScrubbed(uint64_t ChunkCount, uint64_t ChunkBytes) + { + m_ChunkCount.fetch_add(ChunkCount); + m_ByteCount.fetch_add(ChunkBytes); + } + + inline uint64_t ScrubbedChunks() const { return m_ChunkCount; } + inline uint64_t ScrubbedBytes() const { return m_ByteCount; } + + const HashKeySet BadCids() const { return m_BadCid; } + +private: + uint64_t m_ScrubTime = GetHifreqTimerValue(); + bool m_Recover = true; + std::atomic<uint64_t> m_ChunkCount{0}; + std::atomic<uint64_t> m_ByteCount{0}; + HashKeySet m_BadCid; +}; + +} // namespace zen |