diff options
| author | Stefan Boberg <[email protected]> | 2026-03-04 14:13:46 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2026-03-04 14:13:46 +0100 |
| commit | 0763d09a81e5a1d3df11763a7ec75e7860c9510a (patch) | |
| tree | 074575ba6ea259044a179eab0bb396d37268fb09 /src/zennomad/include | |
| parent | native xmake toolchain definition for UE-clang (#805) (diff) | |
| download | zen-0763d09a81e5a1d3df11763a7ec75e7860c9510a.tar.xz zen-0763d09a81e5a1d3df11763a7ec75e7860c9510a.zip | |
compute orchestration (#763)
- Added local process runners for Linux/Wine, Mac with some sandboxing support
- Horde & Nomad provisioning for development and testing
- Client session queues with lifecycle management (active/draining/cancelled), automatic retry with configurable limits, and manual reschedule API
- Improved web UI for orchestrator, compute, and hub dashboards with WebSocket push updates
- Some security hardening
- Improved scalability and `zen exec` command
Still experimental - compute support is disabled by default
Diffstat (limited to 'src/zennomad/include')
| -rw-r--r-- | src/zennomad/include/zennomad/nomadclient.h | 77 | ||||
| -rw-r--r-- | src/zennomad/include/zennomad/nomadconfig.h | 65 | ||||
| -rw-r--r-- | src/zennomad/include/zennomad/nomadprocess.h | 78 | ||||
| -rw-r--r-- | src/zennomad/include/zennomad/nomadprovisioner.h | 107 | ||||
| -rw-r--r-- | src/zennomad/include/zennomad/zennomad.h | 9 |
5 files changed, 336 insertions, 0 deletions
diff --git a/src/zennomad/include/zennomad/nomadclient.h b/src/zennomad/include/zennomad/nomadclient.h new file mode 100644 index 000000000..0a3411ace --- /dev/null +++ b/src/zennomad/include/zennomad/nomadclient.h @@ -0,0 +1,77 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zennomad/nomadconfig.h> + +#include <zencore/logbase.h> + +#include <memory> +#include <string> +#include <vector> + +namespace zen { +class HttpClient; +} + +namespace zen::nomad { + +/** Summary of a Nomad job returned by the API. */ +struct NomadJobInfo +{ + std::string Id; + std::string Status; ///< "pending", "running", "dead" + std::string StatusDescription; +}; + +/** Summary of a Nomad allocation returned by the API. */ +struct NomadAllocInfo +{ + std::string Id; + std::string ClientStatus; ///< "pending", "running", "complete", "failed" + std::string TaskState; ///< State of the task within the allocation +}; + +/** HTTP client for the Nomad REST API (v1). + * + * Handles job submission, status polling, and job termination. + * All calls are synchronous. Thread safety: individual methods are + * not thread-safe; callers must synchronize access. + */ +class NomadClient +{ +public: + explicit NomadClient(const NomadConfig& Config); + ~NomadClient(); + + NomadClient(const NomadClient&) = delete; + NomadClient& operator=(const NomadClient&) = delete; + + /** Initialize the underlying HTTP client. Must be called before other methods. */ + bool Initialize(); + + /** Build the Nomad job registration JSON for the given job ID and orchestrator endpoint. + * The JSON structure varies based on the configured driver and distribution mode. */ + std::string BuildJobJson(const std::string& JobId, const std::string& OrchestratorEndpoint) const; + + /** Submit a job via PUT /v1/jobs. On success, populates OutJob with the job info. */ + bool SubmitJob(const std::string& JobJson, NomadJobInfo& OutJob); + + /** Get the status of a job via GET /v1/job/{jobId}. */ + bool GetJobStatus(const std::string& JobId, NomadJobInfo& OutJob); + + /** Get allocations for a job via GET /v1/job/{jobId}/allocations. */ + bool GetAllocations(const std::string& JobId, std::vector<NomadAllocInfo>& OutAllocs); + + /** Stop a job via DELETE /v1/job/{jobId}. */ + bool StopJob(const std::string& JobId); + + LoggerRef Log() { return m_Log; } + +private: + NomadConfig m_Config; + std::unique_ptr<zen::HttpClient> m_Http; + LoggerRef m_Log; +}; + +} // namespace zen::nomad diff --git a/src/zennomad/include/zennomad/nomadconfig.h b/src/zennomad/include/zennomad/nomadconfig.h new file mode 100644 index 000000000..92d2bbaca --- /dev/null +++ b/src/zennomad/include/zennomad/nomadconfig.h @@ -0,0 +1,65 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zennomad/zennomad.h> + +#include <string> + +namespace zen::nomad { + +/** Nomad task driver type. */ +enum class Driver +{ + RawExec, ///< Use Nomad raw_exec driver (direct process execution) + Docker, ///< Use Nomad Docker driver +}; + +/** How the zenserver binary is made available on Nomad clients. */ +enum class BinaryDistribution +{ + PreDeployed, ///< Binary is already present on Nomad client nodes + Artifact, ///< Download binary via Nomad artifact stanza +}; + +/** Configuration for Nomad worker provisioning. + * + * Specifies the Nomad server URL, authentication, resource limits, and + * job configuration. Used by NomadClient and NomadProvisioner. + */ +struct NomadConfig +{ + bool Enabled = false; ///< Whether Nomad provisioning is active + std::string ServerUrl; ///< Nomad HTTP API URL (e.g. "http://localhost:4646") + std::string AclToken; ///< Nomad ACL token (sent as X-Nomad-Token header) + std::string Datacenter = "dc1"; ///< Target datacenter + std::string Namespace = "default"; ///< Nomad namespace + std::string Region; ///< Nomad region (empty = server default) + + Driver TaskDriver = Driver::RawExec; ///< Task driver for job execution + BinaryDistribution BinDistribution = BinaryDistribution::PreDeployed; ///< How to distribute the zenserver binary + + std::string BinaryPath; ///< Path to zenserver on Nomad clients (PreDeployed mode) + std::string ArtifactSource; ///< URL to download zenserver binary (Artifact mode) + std::string DockerImage; ///< Docker image name (Docker driver mode) + + int MaxJobs = 64; ///< Maximum concurrent Nomad jobs + int CpuMhz = 1000; ///< CPU MHz allocated per task + int MemoryMb = 2048; ///< Memory MB allocated per task + int CoresPerJob = 32; ///< Estimated cores per job (for scaling calculations) + int MaxCores = 2048; ///< Maximum total cores to provision + + std::string JobPrefix = "zenserver-worker"; ///< Prefix for generated Nomad job IDs + + /** Validate the configuration. Returns false if required fields are missing + * or incompatible options are set. */ + bool Validate() const; +}; + +const char* ToString(Driver D); +const char* ToString(BinaryDistribution Dist); + +bool FromString(Driver& OutDriver, std::string_view Str); +bool FromString(BinaryDistribution& OutDist, std::string_view Str); + +} // namespace zen::nomad diff --git a/src/zennomad/include/zennomad/nomadprocess.h b/src/zennomad/include/zennomad/nomadprocess.h new file mode 100644 index 000000000..a66c2ce41 --- /dev/null +++ b/src/zennomad/include/zennomad/nomadprocess.h @@ -0,0 +1,78 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zenhttp/httpclient.h> + +#include <memory> +#include <string> +#include <string_view> +#include <vector> + +namespace zen::nomad { + +struct NomadJobInfo; +struct NomadAllocInfo; + +/** Manages a Nomad agent process running in dev mode for testing. + * + * Spawns `nomad agent -dev` and polls the HTTP API until the agent + * is ready. On destruction or via StopNomadAgent(), the agent + * process is killed. + */ +class NomadProcess +{ +public: + NomadProcess(); + ~NomadProcess(); + + NomadProcess(const NomadProcess&) = delete; + NomadProcess& operator=(const NomadProcess&) = delete; + + /** Spawn a Nomad dev agent and block until the leader endpoint responds (10 s timeout). */ + void SpawnNomadAgent(); + + /** Kill the Nomad agent process. */ + void StopNomadAgent(); + +private: + struct Impl; + std::unique_ptr<Impl> m_Impl; +}; + +/** Lightweight HTTP wrapper around the Nomad v1 REST API for use in tests. + * + * Unlike the production NomadClient (which requires a NomadConfig and + * supports all driver/distribution modes), this client exposes a simpler + * interface geared towards test scenarios. + */ +class NomadTestClient +{ +public: + explicit NomadTestClient(std::string_view BaseUri); + ~NomadTestClient(); + + NomadTestClient(const NomadTestClient&) = delete; + NomadTestClient& operator=(const NomadTestClient&) = delete; + + /** Submit a raw_exec batch job. + * Returns the parsed job info on success; Id will be empty on failure. */ + NomadJobInfo SubmitJob(std::string_view JobId, std::string_view Command, const std::vector<std::string>& Args); + + /** Query the status of an existing job. */ + NomadJobInfo GetJobStatus(std::string_view JobId); + + /** Stop (deregister) a running job. */ + void StopJob(std::string_view JobId); + + /** Get allocations for a job. */ + std::vector<NomadAllocInfo> GetAllocations(std::string_view JobId); + + /** List all jobs, optionally filtered by prefix. */ + std::vector<NomadJobInfo> ListJobs(std::string_view Prefix = ""); + +private: + HttpClient m_HttpClient; +}; + +} // namespace zen::nomad diff --git a/src/zennomad/include/zennomad/nomadprovisioner.h b/src/zennomad/include/zennomad/nomadprovisioner.h new file mode 100644 index 000000000..750693b3f --- /dev/null +++ b/src/zennomad/include/zennomad/nomadprovisioner.h @@ -0,0 +1,107 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zennomad/nomadconfig.h> + +#include <zencore/logbase.h> + +#include <atomic> +#include <condition_variable> +#include <cstdint> +#include <memory> +#include <mutex> +#include <string> +#include <thread> +#include <vector> + +namespace zen::nomad { + +class NomadClient; + +/** Snapshot of the current Nomad provisioning state, returned by NomadProvisioner::GetStats(). */ +struct NomadProvisioningStats +{ + uint32_t TargetCoreCount = 0; ///< Requested number of cores (clamped to MaxCores) + uint32_t EstimatedCoreCount = 0; ///< Cores expected from submitted jobs + uint32_t ActiveJobCount = 0; ///< Number of currently tracked Nomad jobs + uint32_t RunningJobCount = 0; ///< Number of jobs in "running" status +}; + +/** Job lifecycle manager for Nomad worker provisioning. + * + * Provisions remote compute workers by submitting batch jobs to a Nomad + * cluster via the REST API. Each job runs zenserver in compute mode, which + * announces itself back to the orchestrator. + * + * Uses a single management thread that periodically: + * 1. Submits new jobs when estimated cores < target cores + * 2. Polls existing jobs for status changes + * 3. Cleans up dead/failed jobs and adjusts counters + * + * Thread safety: SetTargetCoreCount and GetStats may be called from any thread. + */ +class NomadProvisioner +{ +public: + /** Construct a provisioner. + * @param Config Nomad connection and job configuration. + * @param OrchestratorEndpoint URL of the orchestrator that remote workers announce to. */ + NomadProvisioner(const NomadConfig& Config, std::string_view OrchestratorEndpoint); + + /** Signals the management thread to exit and stops all tracked jobs. */ + ~NomadProvisioner(); + + NomadProvisioner(const NomadProvisioner&) = delete; + NomadProvisioner& operator=(const NomadProvisioner&) = delete; + + /** Set the target number of cores to provision. + * Clamped to NomadConfig::MaxCores. The management thread will + * submit new jobs to approach this target. */ + void SetTargetCoreCount(uint32_t Count); + + /** Return a snapshot of the current provisioning counters. */ + NomadProvisioningStats GetStats() const; + +private: + LoggerRef Log() { return m_Log; } + + struct TrackedJob + { + std::string JobId; + std::string Status; ///< "pending", "running", "dead" + int Cores = 0; + }; + + void ManagementThread(); + void SubmitNewJobs(); + void PollExistingJobs(); + void CleanupDeadJobs(); + void StopAllJobs(); + + std::string GenerateJobId(); + + NomadConfig m_Config; + std::string m_OrchestratorEndpoint; + + std::unique_ptr<NomadClient> m_Client; + + mutable std::mutex m_JobsLock; + std::vector<TrackedJob> m_Jobs; + std::atomic<uint32_t> m_JobIndex{0}; + + std::atomic<uint32_t> m_TargetCoreCount{0}; + std::atomic<uint32_t> m_EstimatedCoreCount{0}; + std::atomic<uint32_t> m_RunningJobCount{0}; + + std::thread m_Thread; + std::mutex m_WakeMutex; + std::condition_variable m_WakeCV; + std::atomic<bool> m_ShouldExit{false}; + + uint32_t m_ProcessId = 0; + + LoggerRef m_Log; +}; + +} // namespace zen::nomad diff --git a/src/zennomad/include/zennomad/zennomad.h b/src/zennomad/include/zennomad/zennomad.h new file mode 100644 index 000000000..09fb98dfe --- /dev/null +++ b/src/zennomad/include/zennomad/zennomad.h @@ -0,0 +1,9 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/zencore.h> + +#if !defined(ZEN_WITH_NOMAD) +# define ZEN_WITH_NOMAD 1 +#endif |