aboutsummaryrefslogtreecommitdiff
path: root/zenstore/include
diff options
context:
space:
mode:
author: Dan Engelbrecht <[email protected]> 2022-06-17 07:06:21 -0700
committer: GitHub <[email protected]> 2022-06-17 07:06:21 -0700
commit: c7e22a4ef1cce7103b9afbeec487461cb32f8dbe (patch)
tree: 8b99d51bf496c96f82161c18fbdcfd5c6f8f31fd /zenstore/include
parent: fixed merge mistake which caused a build error (diff)
download: zen-c7e22a4ef1cce7103b9afbeec487461cb32f8dbe.tar.xz
download: zen-c7e22a4ef1cce7103b9afbeec487461cb32f8dbe.zip
Make cas storage a hidden implementation detail of CidStore (#130) v0.1.4-pre6 v0.1.4-pre5
- Bumped ZEN_SCHEMA_VERSION
- CasStore is no longer a public API, it is hidden behind CidStore
- Moved cas.h from public header folder
- CidStore no longer maps from Cid -> Cas, we store entries in Cas under RawHash
- CasStore now decompresses data to validate content (matching against RawHash)
- CasChunkSet renamed to HashKeySet and put in a separate header/cpp file
- Disabled "Chunk" command for now as it relied on CAS being exposed as a service
- Changed CAS http service to Cid http server
- Moved "Run" command completely inside ZEN_WITH_EXEC_SERVICES define
- Removed "cas.basic" test
- Uncommented ".exec.basic" test and added return-skip at start of test
- Moved ScrubContext to separate header file
- Renamed CasGC to GcManager
- Cleaned up configuration passing in cas store classes
- Removed CAS stuff from GcContext and clarified naming in class
- Removed migration code
Diffstat (limited to 'zenstore/include')
-rw-r--r-- zenstore/include/zenstore/cas.h 144
-rw-r--r-- zenstore/include/zenstore/caslog.h 2
-rw-r--r-- zenstore/include/zenstore/cidstore.h 57
-rw-r--r-- zenstore/include/zenstore/gc.h 44
-rw-r--r-- zenstore/include/zenstore/hashkeyset.h 54
-rw-r--r-- zenstore/include/zenstore/scrubcontext.h 40
6 files changed, 152 insertions, 189 deletions
diff --git a/zenstore/include/zenstore/cas.h b/zenstore/include/zenstore/cas.h
deleted file mode 100644
index 5592fbd0a..000000000
--- a/zenstore/include/zenstore/cas.h
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright Epic Games, Inc. All Rights Reserved.
-
-#pragma once
-
-#include "zenstore.h"
-
-#include <zencore/blake3.h>
-#include <zencore/iobuffer.h>
-#include <zencore/iohash.h>
-#include <zencore/refcount.h>
-#include <zencore/timer.h>
-
-#include <atomic>
-#include <filesystem>
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_set>
-
-namespace zen {
-
-class GcContext;
-class CasGc;
-
-struct CasStoreConfiguration
-{
- // Root directory for CAS store
- std::filesystem::path RootDirectory;
-
- // Threshold below which values are considered 'tiny' and managed using the 'tiny values' strategy
- uint64_t TinyValueThreshold = 1024;
-
- // Threshold above which values are considered 'huge' and managed using the 'huge values' strategy
- uint64_t HugeValueThreshold = 1024 * 1024;
-};
-
-/** Manage a set of IoHash values
- */
-
-class CasChunkSet
-{
-public:
- void AddChunkToSet(const IoHash& HashToAdd);
- void AddChunksToSet(std::span<const IoHash> HashesToAdd);
- void RemoveChunksIf(std::function<bool(const IoHash& CandidateHash)>&& Predicate);
- void IterateChunks(std::function<void(const IoHash& ChunkHash)>&& Callback);
- [[nodiscard]] inline bool ContainsChunk(const IoHash& Hash) const { return m_ChunkSet.find(Hash) != m_ChunkSet.end(); }
- [[nodiscard]] inline bool IsEmpty() const { return m_ChunkSet.empty(); }
- [[nodiscard]] inline size_t GetSize() const { return m_ChunkSet.size(); }
-
- inline void FilterChunks(std::span<const IoHash> Candidates, Invocable<const IoHash&> auto MatchFunc)
- {
- for (const IoHash& Candidate : Candidates)
- {
- if (ContainsChunk(Candidate))
- {
- MatchFunc(Candidate);
- }
- }
- }
-
- inline void FilterChunks(std::span<const IoHash> Candidates, Invocable<const IoHash&, bool> auto MatchFunc)
- {
- for (const IoHash& Candidate : Candidates)
- {
- MatchFunc(Candidate, ContainsChunk(Candidate));
- }
- }
-
-private:
- // Q: should we protect this with a lock, or is that a higher level concern?
- std::unordered_set<IoHash, IoHash::Hasher> m_ChunkSet;
-};
-
-/** Context object for data scrubbing
- *
- * Data scrubbing is when we traverse stored data to validate it and
- * optionally correct/recover
- */
-
-class ScrubContext
-{
-public:
- virtual void ReportBadCasChunks(std::span<IoHash> BadCasChunks);
- inline uint64_t ScrubTimestamp() const { return m_ScrubTime; }
- inline bool RunRecovery() const { return m_Recover; }
- void ReportScrubbed(uint64_t ChunkCount, uint64_t ChunkBytes);
-
- inline uint64_t ScrubbedChunks() const { return m_ChunkCount; }
- inline uint64_t ScrubbedBytes() const { return m_ByteCount; }
-
-private:
- uint64_t m_ScrubTime = GetHifreqTimerValue();
- bool m_Recover = true;
- std::atomic<uint64_t> m_ChunkCount{0};
- std::atomic<uint64_t> m_ByteCount{0};
- CasChunkSet m_BadCas;
- CasChunkSet m_BadCid;
-};
-
-struct CasStoreSize
-{
- uint64_t TinySize{};
- uint64_t SmallSize{};
- uint64_t LargeSize{};
- uint64_t TotalSize{};
-};
-
-/** Content Addressable Storage interface
-
- */
-
-class CasStore
-{
-public:
- virtual ~CasStore() = default;
-
- const CasStoreConfiguration& Config() { return m_Config; }
-
- struct InsertResult
- {
- bool New = false;
- };
-
- virtual void Initialize(const CasStoreConfiguration& Config) = 0;
- virtual InsertResult InsertChunk(IoBuffer Data, const IoHash& ChunkHash) = 0;
- virtual IoBuffer FindChunk(const IoHash& ChunkHash) = 0;
- virtual bool ContainsChunk(const IoHash& ChunkHash) = 0;
- virtual void FilterChunks(CasChunkSet& InOutChunks) = 0;
- virtual void Flush() = 0;
- virtual void Scrub(ScrubContext& Ctx) = 0;
- virtual void GarbageCollect(GcContext& GcCtx) = 0;
- virtual CasStoreSize TotalSize() const = 0;
-
-protected:
- CasStoreConfiguration m_Config;
- uint64_t m_LastScrubTime = 0;
-};
-
-ZENCORE_API std::unique_ptr<CasStore> CreateCasStore(CasGc& Gc);
-
-void CAS_forcelink();
-
-} // namespace zen
diff --git a/zenstore/include/zenstore/caslog.h b/zenstore/include/zenstore/caslog.h
index 4b93a708f..c56b653fc 100644
--- a/zenstore/include/zenstore/caslog.h
+++ b/zenstore/include/zenstore/caslog.h
@@ -2,8 +2,6 @@
#pragma once
-#include "zenstore.h"
-
#include <zencore/uid.h>
#include <zenstore/basicfile.h>
diff --git a/zenstore/include/zenstore/cidstore.h b/zenstore/include/zenstore/cidstore.h
index b0252a2a6..21e3c3160 100644
--- a/zenstore/include/zenstore/cidstore.h
+++ b/zenstore/include/zenstore/cidstore.h
@@ -5,7 +5,7 @@
#include "zenstore.h"
#include <zencore/iohash.h>
-#include <zenstore/cas.h>
+#include <zenstore/hashkeyset.h>
ZEN_THIRD_PARTY_INCLUDES_START
#include <tsl/robin_map.h>
@@ -15,53 +15,68 @@ ZEN_THIRD_PARTY_INCLUDES_END
namespace zen {
+class GcManager;
class CasStore;
class CompressedBuffer;
class IoBuffer;
+class ScrubContext;
/** Content Store
*
- * Data in the content store is referenced by content identifiers (CIDs), rather than their
- * literal hash. This class maps uncompressed hashes to compressed hashes and may
+ * Data in the content store is referenced by content identifiers (CIDs). The store
+ * works with compressed buffers, so the CID is expected to be the RAW hash, and each
+ * chunk is stored directly under that RAW hash.
+ * No CID -> compressed-hash mapping is kept any more, but the CID layer may still
* be used to deal with other kinds of indirections in the future. For example, if we want
* to support chunking then a CID may represent a list of chunks which could be concatenated
* to form the referenced chunk.
*
- * It would likely be possible to implement this mapping in a more efficient way if we
- * integrate it into the CAS store itself, so we can avoid maintaining copies of large
- * hashes in multiple locations. This would also allow us to consolidate commit logs etc
- * which would be more resilient than the current split log scheme
- *
*/
+
+/** Size figures returned by CidStore::TotalSize(); every counter starts out as zero. */
+struct CidStoreSize
+{
+    uint64_t TinySize{0};
+    uint64_t SmallSize{0};
+    uint64_t LargeSize{0};
+    uint64_t TotalSize{0};
+};
+
+/** Settings passed to CidStore::Initialize(). */
+struct CidStoreConfiguration
+{
+    // Root directory of the CAS store
+    std::filesystem::path RootDirectory;
+
+    // Values below this threshold count as 'tiny' and are handled by the 'tiny values' strategy
+    uint64_t TinyValueThreshold{1024};
+
+    // Values above this threshold count as 'huge' and are handled by the 'huge values' strategy
+    uint64_t HugeValueThreshold{1024 * 1024};
+};
+
class CidStore
{
public:
- CidStore(CasStore& InCasStore, const std::filesystem::path& RootDir);
+ CidStore(GcManager& Gc);
~CidStore();
struct InsertResult
{
- IoHash DecompressedId;
- IoHash CompressedHash;
- bool New = false;
+ bool New = false;
};
- InsertResult AddChunk(CompressedBuffer& ChunkData);
- void AddCompressedCid(const IoHash& DecompressedId, const IoHash& Compressed);
+ void Initialize(const CidStoreConfiguration& Config);
+ InsertResult AddChunk(const CompressedBuffer& ChunkData);
IoBuffer FindChunkByCid(const IoHash& DecompressedId);
bool ContainsChunk(const IoHash& DecompressedId);
+ void FilterChunks(HashKeySet& InOutChunks);
void Flush();
void Scrub(ScrubContext& Ctx);
- void RemoveCids(CasChunkSet& CasChunks);
- CasStoreSize CasSize() const;
-
- // TODO: add batch filter support
-
- IoHash RemapCid(const IoHash& DecompressedId);
+ CidStoreSize TotalSize() const;
private:
struct Impl;
- std::unique_ptr<Impl> m_Impl;
+ std::unique_ptr<CasStore> m_CasStore;
+ std::unique_ptr<Impl> m_Impl;
};
} // namespace zen
diff --git a/zenstore/include/zenstore/gc.h b/zenstore/include/zenstore/gc.h
index 398025181..656e594af 100644
--- a/zenstore/include/zenstore/gc.h
+++ b/zenstore/include/zenstore/gc.h
@@ -22,8 +22,8 @@ class logger;
namespace zen {
-class CasChunkSet;
-class CasGc;
+class HashKeySet;
+class GcManager;
class CidStore;
struct IoHash;
@@ -50,18 +50,16 @@ public:
GcContext(GcClock::TimePoint Time = GcClock::Now());
~GcContext();
- void ContributeCids(std::span<const IoHash> Cid);
- void ContributeCas(std::span<const IoHash> Hash);
- void ContributeCacheKeys(const std::string& CacheKeyContext, std::vector<IoHash>&& ExpiredKeys);
+ void AddRetainedCids(std::span<const IoHash> Cid);
+ void SetExpiredCacheKeys(const std::string& CacheKeyContext, std::vector<IoHash>&& ExpiredKeys);
void IterateCids(std::function<void(const IoHash&)> Callback);
void FilterCids(std::span<const IoHash> Cid, std::function<void(const IoHash&)> KeepFunc);
- void FilterCas(std::span<const IoHash> Cas, std::function<void(const IoHash&)> KeepFunc);
- void FilterCas(std::span<const IoHash> Cas, std::function<void(const IoHash&, bool)>&& FilterFunc);
+ void FilterCids(std::span<const IoHash> Cid, std::function<void(const IoHash&, bool)>&& FilterFunc);
- void DeletedCas(std::span<const IoHash> Cas);
- CasChunkSet& DeletedCas();
+ void AddDeletedCids(std::span<const IoHash> Cas);
+ const HashKeySet& DeletedCids();
std::span<const IoHash> ExpiredCacheKeys(const std::string& CacheKeyContext) const;
@@ -97,13 +95,13 @@ private:
class GcContributor
{
public:
- GcContributor(CasGc& Gc);
+ GcContributor(GcManager& Gc);
~GcContributor();
virtual void GatherReferences(GcContext& GcCtx) = 0;
protected:
- CasGc& m_Gc;
+ GcManager& m_Gc;
};
struct GcStorageSize
@@ -117,23 +115,23 @@ struct GcStorageSize
class GcStorage
{
public:
- GcStorage(CasGc& Gc);
+ GcStorage(GcManager& Gc);
~GcStorage();
virtual void CollectGarbage(GcContext& GcCtx) = 0;
virtual GcStorageSize StorageSize() const = 0;
private:
- CasGc& m_Gc;
+ GcManager& m_Gc;
};
/** GC orchestrator
*/
-class CasGc
+class GcManager
{
public:
- CasGc();
- ~CasGc();
+ GcManager();
+ ~GcManager();
void AddGcContributor(GcContributor* Contributor);
void RemoveGcContributor(GcContributor* Contributor);
@@ -143,12 +141,14 @@ public:
void CollectGarbage(GcContext& GcCtx);
- void SetCidStore(CidStore* Cids);
- void OnNewCidReferences(std::span<IoHash> Hashes);
- void OnCommittedCidReferences(std::span<IoHash> Hashes);
- void OnDroppedCidReferences(std::span<IoHash> Hashes);
GcStorageSize TotalStorageSize() const;
+#if ZEN_USE_REF_TRACKING
+ void OnNewCidReferences(std::span<IoHash> Hashes);
+ void OnCommittedCidReferences(std::span<IoHash> Hashes);
+ void OnDroppedCidReferences(std::span<IoHash> Hashes);
+#endif
+
private:
mutable RwLock m_Lock;
std::vector<GcContributor*> m_GcContribs;
@@ -180,7 +180,7 @@ struct GcSchedulerConfig
class GcScheduler
{
public:
- GcScheduler(CasGc& CasGc);
+ GcScheduler(GcManager& GcManager);
~GcScheduler();
void Initialize(const GcSchedulerConfig& Config);
@@ -201,7 +201,7 @@ private:
spdlog::logger& Log() { return m_Log; }
spdlog::logger& m_Log;
- CasGc& m_CasGc;
+ GcManager& m_GcManager;
GcSchedulerConfig m_Config;
GcClock::TimePoint m_LastGcTime{};
GcClock::TimePoint m_NextGcTime{};
diff --git a/zenstore/include/zenstore/hashkeyset.h b/zenstore/include/zenstore/hashkeyset.h
new file mode 100644
index 000000000..411a6256e
--- /dev/null
+++ b/zenstore/include/zenstore/hashkeyset.h
@@ -0,0 +1,54 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include "zenstore.h"
+
+#include <zencore/iohash.h>
+
+#include <functional>
+#include <unordered_set>
+
+namespace zen {
+
+/** Manage a set of IoHash values
+ */
+
+class HashKeySet
+{
+public:
+    void AddHashToSet(const IoHash& HashToAdd);
+    void AddHashesToSet(std::span<const IoHash> HashesToAdd);
+    void RemoveHashesIf(std::function<bool(const IoHash& CandidateHash)>&& Predicate);
+    void IterateHashes(std::function<void(const IoHash& Hash)>&& Callback) const;
+
+    // True when Hash is a member of the set
+    [[nodiscard]] inline bool ContainsHash(const IoHash& Hash) const { return m_HashSet.count(Hash) != 0; }
+    // True when the set holds no hashes
+    [[nodiscard]] inline bool IsEmpty() const { return m_HashSet.size() == 0; }
+    // Number of hashes currently held
+    [[nodiscard]] inline size_t GetSize() const { return m_HashSet.size(); }
+
+    // Invokes MatchFunc(Candidate) for every candidate found in the set, in the order given
+    inline void FilterHashes(std::span<const IoHash> Candidates, Invocable<const IoHash&> auto MatchFunc) const
+    {
+        for (const IoHash& Key : Candidates)
+        {
+            if (!ContainsHash(Key))
+            {
+                continue;
+            }
+            MatchFunc(Key);
+        }
+    }
+
+    // Invokes MatchFunc(Candidate, IsMember) for every candidate, member or not, in the order given
+    inline void FilterHashes(std::span<const IoHash> Candidates, Invocable<const IoHash&, bool> auto MatchFunc) const
+    {
+        for (const IoHash& Key : Candidates)
+        {
+            const bool IsMember = ContainsHash(Key);
+            MatchFunc(Key, IsMember);
+        }
+    }
+
+private:
+    // No locking is done in this class. Open question: should it be, or is that a higher level concern?
+    std::unordered_set<IoHash, IoHash::Hasher> m_HashSet;
+};
+
+void hashkeyset_forcelink();
+
+} // namespace zen
diff --git a/zenstore/include/zenstore/scrubcontext.h b/zenstore/include/zenstore/scrubcontext.h
new file mode 100644
index 000000000..bf906492c
--- /dev/null
+++ b/zenstore/include/zenstore/scrubcontext.h
@@ -0,0 +1,40 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zencore/iohash.h>
+#include <zencore/timer.h>
+#include <zenstore/hashkeyset.h>
+
+#include <atomic>
+#include <cstdint>
+#include <span>
+
+namespace zen {
+
+/** Context object for data scrubbing
+ *
+ * Data scrubbing is when we traverse stored data to validate it and
+ * optionally correct/recover
+ */
+
+class ScrubContext
+{
+public:
+    // The class has a virtual member, so the destructor is virtual too: destroying a
+    // derived context through a ScrubContext pointer stays well defined.
+    virtual ~ScrubContext() = default;
+
+    // Adds the given hashes to the set of bad CIDs kept by this context
+    virtual void ReportBadCidChunks(std::span<IoHash> BadCasChunks) { m_BadCid.AddHashesToSet(BadCasChunks); }
+
+    // Value of GetHifreqTimerValue() captured when this context was constructed
+    inline uint64_t ScrubTimestamp() const { return m_ScrubTime; }
+
+    // Recovery flag; initialised to true and never changed by this class
+    inline bool RunRecovery() const { return m_Recover; }
+
+    // Adds ChunkCount and ChunkBytes to the running totals (atomic adds)
+    void ReportScrubbed(uint64_t ChunkCount, uint64_t ChunkBytes)
+    {
+        m_ChunkCount.fetch_add(ChunkCount);
+        m_ByteCount.fetch_add(ChunkBytes);
+    }
+
+    // Totals accumulated through ReportScrubbed()
+    inline uint64_t ScrubbedChunks() const { return m_ChunkCount.load(); }
+    inline uint64_t ScrubbedBytes() const { return m_ByteCount.load(); }
+
+    // Set of hashes reported through ReportBadCidChunks(). Returned by reference so that a
+    // call does not copy the whole set; the reference is valid for the lifetime of this context.
+    // NOTE(review): m_BadCid is not guarded by a lock here - confirm callers do not read it
+    // while other threads are still reporting.
+    const HashKeySet& BadCids() const { return m_BadCid; }
+
+private:
+    uint64_t              m_ScrubTime = GetHifreqTimerValue();
+    bool                  m_Recover   = true;
+    std::atomic<uint64_t> m_ChunkCount{0};
+    std::atomic<uint64_t> m_ByteCount{0};
+    HashKeySet            m_BadCid;
+};
+
+} // namespace zen