diff options
| author | Stefan Boberg <[email protected]> | 2026-04-14 16:18:23 +0200 |
|---|---|---|
| committer | Stefan Boberg <[email protected]> | 2026-04-14 16:18:23 +0200 |
| commit | 053b7373357d2555bac111b94c6909bc148f24ac (patch) | |
| tree | 456a8ce2a1b38ff6aef342324f7fa4c17fdadd30 /src/zenovermind/include | |
| parent | 5.8.4 (diff) | |
| download | zen-sb/compute-overmind.tar.xz zen-sb/compute-overmind.zip | |
Add Overmind provisioner alongside Horde and Nomad (sb/compute-overmind)
Introduces the zenovermind module with an HTTP client targeting the
Overmind REST gateway (/v1/jobs) and a management-thread provisioner
that schedules, polls, and cancels jobs following the same pattern as
the existing Nomad provisioner. Wired into the compute server with
full CLI options (--overmind-*), lifecycle management, and maintenance
tick support behind the ZEN_WITH_OVERMIND compile flag.
Diffstat (limited to 'src/zenovermind/include')
4 files changed, 234 insertions, 0 deletions
diff --git a/src/zenovermind/include/zenovermind/overmindclient.h b/src/zenovermind/include/zenovermind/overmindclient.h
new file mode 100644
index 000000000..68348b4a6
--- /dev/null
+++ b/src/zenovermind/include/zenovermind/overmindclient.h
@@ -0,0 +1,69 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zenovermind/overmindconfig.h>
+
+#include <zencore/logbase.h>
+
+#include <memory>
+#include <string>
+
+namespace zen {
+class HttpClient;
+}
+
+namespace zen::overmind {
+
+/** Summary of an Overmind job returned by the REST API. */
+struct OvermindJobInfo
+{
+	std::string Id;
+	std::string Status;  ///< "STATUS_PENDING", "STATUS_RUNNING", "STATUS_COMPLETE", "STATUS_ERROR", ...
+};
+
+/** HTTP client for the Overmind REST gateway (v1).
+ *
+ * Handles job scheduling, status polling, and job cancellation via the
+ * grpc-gateway REST endpoints on port 2580.
+ *
+ * All calls are synchronous. Thread safety: individual methods are
+ * not thread-safe; callers must synchronize access.
+ */
+class OvermindClient
+{
+public:
+	explicit OvermindClient(const OvermindConfig& Config);
+	~OvermindClient();
+
+	OvermindClient(const OvermindClient&) = delete;
+	OvermindClient& operator=(const OvermindClient&) = delete;
+
+	/** Initialize the underlying HTTP client. Must be called before other methods. */
+	bool Initialize();
+
+	/** Build the JSON body for a ScheduleJob request. */
+	std::string BuildJobJson(const std::string& JobName,
+	                         const std::string& OrchestratorEndpoint,
+	                         const std::string& CoordinatorSession = {},
+	                         bool               CleanStart         = false,
+	                         const std::string& TraceHost          = {}) const;
+
+	/** Schedule a job via POST /v1/jobs. On success, populates OutJob. */
+	bool ScheduleJob(const std::string& JobJson, OvermindJobInfo& OutJob);
+
+	/** Get the status of a job via GET /v1/jobs/{jobId}. */
+	bool GetJobStatus(const std::string& JobId, OvermindJobInfo& OutJob);
+
+	/** Cancel a job via DELETE /v1/jobs/{jobId}. */
+	bool CancelJob(const std::string& JobId);
+
+	LoggerRef Log() { return m_Log; }
+
+private:
+	OvermindConfig                   m_Config;
+	std::unique_ptr<zen::HttpClient> m_Http;
+	LoggerRef                        m_Log;
+};
+
+}  // namespace zen::overmind
diff --git a/src/zenovermind/include/zenovermind/overmindconfig.h b/src/zenovermind/include/zenovermind/overmindconfig.h
new file mode 100644
index 000000000..a463e31ea
--- /dev/null
+++ b/src/zenovermind/include/zenovermind/overmindconfig.h
@@ -0,0 +1,45 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zenovermind/zenovermind.h>
+
+#include <string>
+
+namespace zen::overmind {
+
+/** Configuration for Overmind worker provisioning.
+ *
+ * Specifies the Overmind REST gateway URL, authentication, namespace,
+ * region, and resource limits. Used by OvermindClient and OvermindProvisioner.
+ */
+struct OvermindConfig
+{
+	bool        Enabled = false;  ///< Whether Overmind provisioning is active
+	std::string ServerUrl;        ///< Overmind REST gateway URL (e.g. "http://localhost:2580")
+	std::string AuthToken;        ///< JWT bearer token for authentication
+
+	std::string Namespace;  ///< Overmind namespace for job submission
+	std::string Region;     ///< Target region (e.g. "REGION_US_EAST")
+
+	/** Overmind command reference for the zenserver binary in
+	 * "namespace:name:version" format (e.g. "infra:zenserver:v1.0.0"). */
+	std::string CommandRef;
+
+	std::string Os   = "OPERATING_SYSTEM_LINUX";   ///< Target operating system
+	std::string Arch = "CPU_ARCHITECTURE_X86_64";  ///< Target CPU architecture
+
+	std::string Memory = "4GiB";   ///< Memory per task
+	std::string Cpu    = "2000m";  ///< CPU per task (millicores)
+
+	int MaxJobs     = 64;    ///< Maximum concurrent Overmind jobs
+	int CoresPerJob = 32;    ///< Estimated cores per job (for scaling calculations)
+	int MaxCores    = 2048;  ///< Maximum total cores to provision
+
+	std::string JobName = "zenserver-worker";  ///< Name for generated Overmind jobs
+
+	/** Validate the configuration. Returns false if required fields are missing. */
+	bool Validate() const;
+};
+
+}  // namespace zen::overmind
diff --git a/src/zenovermind/include/zenovermind/overmindprovisioner.h b/src/zenovermind/include/zenovermind/overmindprovisioner.h
new file mode 100644
index 000000000..cb0a84728
--- /dev/null
+++ b/src/zenovermind/include/zenovermind/overmindprovisioner.h
@@ -0,0 +1,111 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zenovermind/overmindconfig.h>
+
+#include <zencore/logbase.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <string_view>
+#include <thread>
+#include <vector>
+
+namespace zen::overmind {
+
+class OvermindClient;
+
+/** Snapshot of the current Overmind provisioning state, returned by OvermindProvisioner::GetStats(). */
+struct OvermindProvisioningStats
+{
+	uint32_t TargetCoreCount    = 0;  ///< Requested number of cores (clamped to MaxCores)
+	uint32_t EstimatedCoreCount = 0;  ///< Cores expected from submitted jobs
+	uint32_t ActiveJobCount     = 0;  ///< Number of currently tracked Overmind jobs
+	uint32_t RunningJobCount    = 0;  ///< Number of jobs in running status
+};
+
+/** Job lifecycle manager for Overmind worker provisioning.
+ *
+ * Provisions remote compute workers by scheduling jobs via the Overmind
+ * REST gateway. Each job runs zenserver in compute mode, which
+ * announces itself back to the orchestrator.
+ *
+ * Uses a single management thread that periodically:
+ * 1. Submits new jobs when estimated cores < target cores
+ * 2. Polls existing jobs for status changes
+ * 3. Cleans up completed/failed jobs and adjusts counters
+ *
+ * Thread safety: SetTargetCoreCount and GetStats may be called from any thread.
+ */
+class OvermindProvisioner
+{
+public:
+	/** Construct a provisioner.
+	 * @param Config Overmind connection and job configuration.
+	 * @param OrchestratorEndpoint URL of the orchestrator that remote workers announce to. */
+	OvermindProvisioner(const OvermindConfig& Config,
+	                    std::string_view      OrchestratorEndpoint,
+	                    std::string_view      CoordinatorSession = {},
+	                    bool                  CleanStart         = false,
+	                    std::string_view      TraceHost          = {});
+
+	/** Signals the management thread to exit and cancels all tracked jobs. */
+	~OvermindProvisioner();
+
+	OvermindProvisioner(const OvermindProvisioner&) = delete;
+	OvermindProvisioner& operator=(const OvermindProvisioner&) = delete;
+
+	/** Set the target number of cores to provision.
+	 * Clamped to OvermindConfig::MaxCores. The management thread will
+	 * schedule new jobs to approach this target. */
+	void SetTargetCoreCount(uint32_t Count);
+
+	/** Return a snapshot of the current provisioning counters. */
+	OvermindProvisioningStats GetStats() const;
+
+private:
+	LoggerRef Log() { return m_Log; }
+
+	struct TrackedJob
+	{
+		std::string Id;
+		std::string Status;  ///< Overmind status string
+		int         Cores = 0;
+	};
+
+	void ManagementThread();
+	void SubmitNewJobs();
+	void PollExistingJobs();
+	void CleanupFinishedJobs();
+	void CancelAllJobs();
+
+	OvermindConfig m_Config;
+	std::string    m_OrchestratorEndpoint;
+	std::string    m_CoordinatorSession;
+	bool           m_CleanStart = false;
+	std::string    m_TraceHost;
+
+	std::unique_ptr<OvermindClient> m_Client;
+
+	mutable std::mutex      m_JobsLock;
+	std::vector<TrackedJob> m_Jobs;
+	std::atomic<uint32_t>   m_JobIndex{0};
+
+	std::atomic<uint32_t> m_TargetCoreCount{0};
+	std::atomic<uint32_t> m_EstimatedCoreCount{0};
+	std::atomic<uint32_t> m_RunningJobCount{0};
+
+	std::thread             m_Thread;
+	std::mutex              m_WakeMutex;
+	std::condition_variable m_WakeCV;
+	std::atomic<bool>       m_ShouldExit{false};
+
+	LoggerRef m_Log;
+};
+
+}  // namespace zen::overmind
diff --git a/src/zenovermind/include/zenovermind/zenovermind.h b/src/zenovermind/include/zenovermind/zenovermind.h
new file mode 100644
index 000000000..b7f451a16
--- /dev/null
+++ b/src/zenovermind/include/zenovermind/zenovermind.h
@@ -0,0 +1,9 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zencore/zencore.h>
+
+#if !defined(ZEN_WITH_OVERMIND)
+#	define ZEN_WITH_OVERMIND 1
+#endif