aboutsummaryrefslogtreecommitdiff
path: root/src/zennomad/include
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2026-03-04 14:13:46 +0100
committerGitHub Enterprise <[email protected]>2026-03-04 14:13:46 +0100
commit0763d09a81e5a1d3df11763a7ec75e7860c9510a (patch)
tree074575ba6ea259044a179eab0bb396d37268fb09 /src/zennomad/include
parentnative xmake toolchain definition for UE-clang (#805) (diff)
downloadzen-0763d09a81e5a1d3df11763a7ec75e7860c9510a.tar.xz
zen-0763d09a81e5a1d3df11763a7ec75e7860c9510a.zip
compute orchestration (#763)
- Added local process runners for Linux/Wine, Mac with some sandboxing support - Horde & Nomad provisioning for development and testing - Client session queues with lifecycle management (active/draining/cancelled), automatic retry with configurable limits, and manual reschedule API - Improved web UI for orchestrator, compute, and hub dashboards with WebSocket push updates - Some security hardening - Improved scalability and `zen exec` command Still experimental - compute support is disabled by default
Diffstat (limited to 'src/zennomad/include')
-rw-r--r--src/zennomad/include/zennomad/nomadclient.h77
-rw-r--r--src/zennomad/include/zennomad/nomadconfig.h65
-rw-r--r--src/zennomad/include/zennomad/nomadprocess.h78
-rw-r--r--src/zennomad/include/zennomad/nomadprovisioner.h107
-rw-r--r--src/zennomad/include/zennomad/zennomad.h9
5 files changed, 336 insertions, 0 deletions
diff --git a/src/zennomad/include/zennomad/nomadclient.h b/src/zennomad/include/zennomad/nomadclient.h
new file mode 100644
index 000000000..0a3411ace
--- /dev/null
+++ b/src/zennomad/include/zennomad/nomadclient.h
@@ -0,0 +1,77 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zennomad/nomadconfig.h>
+
+#include <zencore/logbase.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace zen {
+class HttpClient;
+}
+
+namespace zen::nomad {
+
+/** Summary of a Nomad job returned by the API. */
+struct NomadJobInfo
+{
+ std::string Id;
+ std::string Status; ///< "pending", "running", "dead"
+ std::string StatusDescription;
+};
+
+/** Summary of a Nomad allocation returned by the API. */
+struct NomadAllocInfo
+{
+ std::string Id;
+ std::string ClientStatus; ///< "pending", "running", "complete", "failed"
+ std::string TaskState; ///< State of the task within the allocation
+};
+
+/** HTTP client for the Nomad REST API (v1).
+ *
+ * Handles job submission, status polling, and job termination.
+ * All calls are synchronous. Thread safety: individual methods are
+ * not thread-safe; callers must synchronize access.
+ */
+class NomadClient
+{
+public:
+ explicit NomadClient(const NomadConfig& Config);
+ ~NomadClient();
+
+ NomadClient(const NomadClient&) = delete;
+ NomadClient& operator=(const NomadClient&) = delete;
+
+ /** Initialize the underlying HTTP client. Must be called before other methods. */
+ bool Initialize();
+
+ /** Build the Nomad job registration JSON for the given job ID and orchestrator endpoint.
+ * The JSON structure varies based on the configured driver and distribution mode. */
+ std::string BuildJobJson(const std::string& JobId, const std::string& OrchestratorEndpoint) const;
+
+ /** Submit a job via PUT /v1/jobs. On success, populates OutJob with the job info. */
+ bool SubmitJob(const std::string& JobJson, NomadJobInfo& OutJob);
+
+ /** Get the status of a job via GET /v1/job/{jobId}. */
+ bool GetJobStatus(const std::string& JobId, NomadJobInfo& OutJob);
+
+ /** Get allocations for a job via GET /v1/job/{jobId}/allocations. */
+ bool GetAllocations(const std::string& JobId, std::vector<NomadAllocInfo>& OutAllocs);
+
+ /** Stop a job via DELETE /v1/job/{jobId}. */
+ bool StopJob(const std::string& JobId);
+
+ LoggerRef Log() { return m_Log; }
+
+private:
+ NomadConfig m_Config;
+ std::unique_ptr<zen::HttpClient> m_Http;
+ LoggerRef m_Log;
+};
+
+} // namespace zen::nomad
diff --git a/src/zennomad/include/zennomad/nomadconfig.h b/src/zennomad/include/zennomad/nomadconfig.h
new file mode 100644
index 000000000..92d2bbaca
--- /dev/null
+++ b/src/zennomad/include/zennomad/nomadconfig.h
@@ -0,0 +1,65 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zennomad/zennomad.h>
+
+#include <string>
+
+namespace zen::nomad {
+
+/** Nomad task driver type. */
+enum class Driver
+{
+ RawExec, ///< Use Nomad raw_exec driver (direct process execution)
+ Docker, ///< Use Nomad Docker driver
+};
+
+/** How the zenserver binary is made available on Nomad clients. */
+enum class BinaryDistribution
+{
+ PreDeployed, ///< Binary is already present on Nomad client nodes
+ Artifact, ///< Download binary via Nomad artifact stanza
+};
+
+/** Configuration for Nomad worker provisioning.
+ *
+ * Specifies the Nomad server URL, authentication, resource limits, and
+ * job configuration. Used by NomadClient and NomadProvisioner.
+ */
+struct NomadConfig
+{
+ bool Enabled = false; ///< Whether Nomad provisioning is active
+ std::string ServerUrl; ///< Nomad HTTP API URL (e.g. "http://localhost:4646")
+ std::string AclToken; ///< Nomad ACL token (sent as X-Nomad-Token header)
+ std::string Datacenter = "dc1"; ///< Target datacenter
+ std::string Namespace = "default"; ///< Nomad namespace
+ std::string Region; ///< Nomad region (empty = server default)
+
+ Driver TaskDriver = Driver::RawExec; ///< Task driver for job execution
+ BinaryDistribution BinDistribution = BinaryDistribution::PreDeployed; ///< How to distribute the zenserver binary
+
+ std::string BinaryPath; ///< Path to zenserver on Nomad clients (PreDeployed mode)
+ std::string ArtifactSource; ///< URL to download zenserver binary (Artifact mode)
+ std::string DockerImage; ///< Docker image name (Docker driver mode)
+
+ int MaxJobs = 64; ///< Maximum concurrent Nomad jobs
+ int CpuMhz = 1000; ///< CPU MHz allocated per task
+ int MemoryMb = 2048; ///< Memory MB allocated per task
+ int CoresPerJob = 32; ///< Estimated cores per job (for scaling calculations)
+ int MaxCores = 2048; ///< Maximum total cores to provision
+
+ std::string JobPrefix = "zenserver-worker"; ///< Prefix for generated Nomad job IDs
+
+ /** Validate the configuration. Returns false if required fields are missing
+ * or incompatible options are set. */
+ bool Validate() const;
+};
+
+const char* ToString(Driver D);
+const char* ToString(BinaryDistribution Dist);
+
+bool FromString(Driver& OutDriver, std::string_view Str);
+bool FromString(BinaryDistribution& OutDist, std::string_view Str);
+
+} // namespace zen::nomad
diff --git a/src/zennomad/include/zennomad/nomadprocess.h b/src/zennomad/include/zennomad/nomadprocess.h
new file mode 100644
index 000000000..a66c2ce41
--- /dev/null
+++ b/src/zennomad/include/zennomad/nomadprocess.h
@@ -0,0 +1,78 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zenhttp/httpclient.h>
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace zen::nomad {
+
+struct NomadJobInfo;
+struct NomadAllocInfo;
+
+/** Manages a Nomad agent process running in dev mode for testing.
+ *
+ * Spawns `nomad agent -dev` and polls the HTTP API until the agent
+ * is ready. On destruction or via StopNomadAgent(), the agent
+ * process is killed.
+ */
+class NomadProcess
+{
+public:
+ NomadProcess();
+ ~NomadProcess();
+
+ NomadProcess(const NomadProcess&) = delete;
+ NomadProcess& operator=(const NomadProcess&) = delete;
+
+ /** Spawn a Nomad dev agent and block until the leader endpoint responds (10 s timeout). */
+ void SpawnNomadAgent();
+
+ /** Kill the Nomad agent process. */
+ void StopNomadAgent();
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> m_Impl;
+};
+
+/** Lightweight HTTP wrapper around the Nomad v1 REST API for use in tests.
+ *
+ * Unlike the production NomadClient (which requires a NomadConfig and
+ * supports all driver/distribution modes), this client exposes a simpler
+ * interface geared towards test scenarios.
+ */
+class NomadTestClient
+{
+public:
+ explicit NomadTestClient(std::string_view BaseUri);
+ ~NomadTestClient();
+
+ NomadTestClient(const NomadTestClient&) = delete;
+ NomadTestClient& operator=(const NomadTestClient&) = delete;
+
+ /** Submit a raw_exec batch job.
+ * Returns the parsed job info on success; Id will be empty on failure. */
+ NomadJobInfo SubmitJob(std::string_view JobId, std::string_view Command, const std::vector<std::string>& Args);
+
+ /** Query the status of an existing job. */
+ NomadJobInfo GetJobStatus(std::string_view JobId);
+
+ /** Stop (deregister) a running job. */
+ void StopJob(std::string_view JobId);
+
+ /** Get allocations for a job. */
+ std::vector<NomadAllocInfo> GetAllocations(std::string_view JobId);
+
+ /** List all jobs, optionally filtered by prefix. */
+ std::vector<NomadJobInfo> ListJobs(std::string_view Prefix = "");
+
+private:
+ HttpClient m_HttpClient;
+};
+
+} // namespace zen::nomad
diff --git a/src/zennomad/include/zennomad/nomadprovisioner.h b/src/zennomad/include/zennomad/nomadprovisioner.h
new file mode 100644
index 000000000..750693b3f
--- /dev/null
+++ b/src/zennomad/include/zennomad/nomadprovisioner.h
@@ -0,0 +1,107 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zennomad/nomadconfig.h>
+
+#include <zencore/logbase.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+namespace zen::nomad {
+
+class NomadClient;
+
+/** Snapshot of the current Nomad provisioning state, returned by NomadProvisioner::GetStats(). */
+struct NomadProvisioningStats
+{
+ uint32_t TargetCoreCount = 0; ///< Requested number of cores (clamped to MaxCores)
+ uint32_t EstimatedCoreCount = 0; ///< Cores expected from submitted jobs
+ uint32_t ActiveJobCount = 0; ///< Number of currently tracked Nomad jobs
+ uint32_t RunningJobCount = 0; ///< Number of jobs in "running" status
+};
+
+/** Job lifecycle manager for Nomad worker provisioning.
+ *
+ * Provisions remote compute workers by submitting batch jobs to a Nomad
+ * cluster via the REST API. Each job runs zenserver in compute mode, which
+ * announces itself back to the orchestrator.
+ *
+ * Uses a single management thread that periodically:
+ * 1. Submits new jobs when estimated cores < target cores
+ * 2. Polls existing jobs for status changes
+ * 3. Cleans up dead/failed jobs and adjusts counters
+ *
+ * Thread safety: SetTargetCoreCount and GetStats may be called from any thread.
+ */
+class NomadProvisioner
+{
+public:
+ /** Construct a provisioner.
+ * @param Config Nomad connection and job configuration.
+ * @param OrchestratorEndpoint URL of the orchestrator that remote workers announce to. */
+ NomadProvisioner(const NomadConfig& Config, std::string_view OrchestratorEndpoint);
+
+ /** Signals the management thread to exit and stops all tracked jobs. */
+ ~NomadProvisioner();
+
+ NomadProvisioner(const NomadProvisioner&) = delete;
+ NomadProvisioner& operator=(const NomadProvisioner&) = delete;
+
+ /** Set the target number of cores to provision.
+ * Clamped to NomadConfig::MaxCores. The management thread will
+ * submit new jobs to approach this target. */
+ void SetTargetCoreCount(uint32_t Count);
+
+ /** Return a snapshot of the current provisioning counters. */
+ NomadProvisioningStats GetStats() const;
+
+private:
+ LoggerRef Log() { return m_Log; }
+
+ struct TrackedJob
+ {
+ std::string JobId;
+ std::string Status; ///< "pending", "running", "dead"
+ int Cores = 0;
+ };
+
+ void ManagementThread();
+ void SubmitNewJobs();
+ void PollExistingJobs();
+ void CleanupDeadJobs();
+ void StopAllJobs();
+
+ std::string GenerateJobId();
+
+ NomadConfig m_Config;
+ std::string m_OrchestratorEndpoint;
+
+ std::unique_ptr<NomadClient> m_Client;
+
+ mutable std::mutex m_JobsLock;
+ std::vector<TrackedJob> m_Jobs;
+ std::atomic<uint32_t> m_JobIndex{0};
+
+ std::atomic<uint32_t> m_TargetCoreCount{0};
+ std::atomic<uint32_t> m_EstimatedCoreCount{0};
+ std::atomic<uint32_t> m_RunningJobCount{0};
+
+ std::thread m_Thread;
+ std::mutex m_WakeMutex;
+ std::condition_variable m_WakeCV;
+ std::atomic<bool> m_ShouldExit{false};
+
+ uint32_t m_ProcessId = 0;
+
+ LoggerRef m_Log;
+};
+
+} // namespace zen::nomad
diff --git a/src/zennomad/include/zennomad/zennomad.h b/src/zennomad/include/zennomad/zennomad.h
new file mode 100644
index 000000000..09fb98dfe
--- /dev/null
+++ b/src/zennomad/include/zennomad/zennomad.h
@@ -0,0 +1,9 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zencore/zencore.h>
+
+#if !defined(ZEN_WITH_NOMAD)
+# define ZEN_WITH_NOMAD 1
+#endif