aboutsummaryrefslogtreecommitdiff
path: root/src/zenserver/compute
diff options
context:
space:
mode:
Diffstat (limited to 'src/zenserver/compute')
-rw-r--r--src/zenserver/compute/computeserver.cpp728
-rw-r--r--src/zenserver/compute/computeserver.h110
-rw-r--r--src/zenserver/compute/computeservice.cpp100
-rw-r--r--src/zenserver/compute/computeservice.h36
4 files changed, 806 insertions, 168 deletions
diff --git a/src/zenserver/compute/computeserver.cpp b/src/zenserver/compute/computeserver.cpp
index 0f9ef0287..0d8550c5b 100644
--- a/src/zenserver/compute/computeserver.cpp
+++ b/src/zenserver/compute/computeserver.cpp
@@ -1,9 +1,9 @@
// Copyright Epic Games, Inc. All Rights Reserved.
#include "computeserver.h"
-#include <zencompute/httpfunctionservice.h>
-#include "computeservice.h"
-
+#include <zencompute/cloudmetadata.h>
+#include <zencompute/httpcomputeservice.h>
+#include <zencompute/httporchestrator.h>
#if ZEN_WITH_COMPUTE_SERVICES
# include <zencore/fmtutils.h>
@@ -13,10 +13,20 @@
# include <zencore/scopeguard.h>
# include <zencore/sentryintegration.h>
# include <zencore/system.h>
+# include <zencore/compactbinarybuilder.h>
# include <zencore/windows.h>
+# include <zenhttp/httpclient.h>
# include <zenhttp/httpapiservice.h>
# include <zenstore/cidstore.h>
# include <zenutil/service.h>
+# if ZEN_WITH_HORDE
+# include <zenhorde/hordeconfig.h>
+# include <zenhorde/hordeprovisioner.h>
+# endif
+# if ZEN_WITH_NOMAD
+# include <zennomad/nomadconfig.h>
+# include <zennomad/nomadprovisioner.h>
+# endif
ZEN_THIRD_PARTY_INCLUDES_START
# include <cxxopts.hpp>
@@ -29,6 +39,13 @@ ZenComputeServerConfigurator::AddCliOptions(cxxopts::Options& Options)
{
Options.add_option("compute",
"",
+ "max-actions",
+ "Maximum number of concurrent local actions (0 = auto)",
+ cxxopts::value<int32_t>(m_ServerOptions.MaxConcurrentActions)->default_value("0"),
+ "");
+
+ Options.add_option("compute",
+ "",
"upstream-notification-endpoint",
"Endpoint URL for upstream notifications",
cxxopts::value<std::string>(m_ServerOptions.UpstreamNotificationEndpoint)->default_value(""),
@@ -40,6 +57,236 @@ ZenComputeServerConfigurator::AddCliOptions(cxxopts::Options& Options)
"Instance ID for use in notifications",
cxxopts::value<std::string>(m_ServerOptions.InstanceId)->default_value(""),
"");
+
+ Options.add_option("compute",
+ "",
+ "coordinator-endpoint",
+ "Endpoint URL for coordinator service",
+ cxxopts::value<std::string>(m_ServerOptions.CoordinatorEndpoint)->default_value(""),
+ "");
+
+ Options.add_option("compute",
+ "",
+ "idms",
+ "Enable IDMS cloud detection; optionally specify a custom probe endpoint",
+ cxxopts::value<std::string>(m_ServerOptions.IdmsEndpoint)->default_value("")->implicit_value("auto"),
+ "");
+
+ Options.add_option("compute",
+ "",
+ "worker-websocket",
+ "Use WebSocket for worker-orchestrator link (instant reachability detection)",
+ cxxopts::value<bool>(m_ServerOptions.EnableWorkerWebSocket)->default_value("false"),
+ "");
+
+# if ZEN_WITH_HORDE
+ // Horde provisioning options
+ Options.add_option("horde",
+ "",
+ "horde-enabled",
+ "Enable Horde worker provisioning",
+ cxxopts::value<bool>(m_ServerOptions.HordeConfig.Enabled)->default_value("false"),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-server",
+ "Horde server URL",
+ cxxopts::value<std::string>(m_ServerOptions.HordeConfig.ServerUrl)->default_value(""),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-token",
+ "Horde authentication token",
+ cxxopts::value<std::string>(m_ServerOptions.HordeConfig.AuthToken)->default_value(""),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-pool",
+ "Horde pool name",
+ cxxopts::value<std::string>(m_ServerOptions.HordeConfig.Pool)->default_value(""),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-cluster",
+ "Horde cluster ID ('default' or '_auto' for auto-resolve)",
+ cxxopts::value<std::string>(m_ServerOptions.HordeConfig.Cluster)->default_value("default"),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-mode",
+ "Horde connection mode (direct, tunnel, relay)",
+ cxxopts::value<std::string>(m_HordeModeStr)->default_value("direct"),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-encryption",
+ "Horde transport encryption (none, aes)",
+ cxxopts::value<std::string>(m_HordeEncryptionStr)->default_value("none"),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-max-cores",
+ "Maximum number of Horde cores to provision",
+ cxxopts::value<int>(m_ServerOptions.HordeConfig.MaxCores)->default_value("2048"),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-host",
+ "Host address for Horde agents to connect back to",
+ cxxopts::value<std::string>(m_ServerOptions.HordeConfig.HostAddress)->default_value(""),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-condition",
+ "Additional Horde agent filter condition",
+ cxxopts::value<std::string>(m_ServerOptions.HordeConfig.Condition)->default_value(""),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-binaries",
+ "Path to directory containing zenserver binary for remote upload",
+ cxxopts::value<std::string>(m_ServerOptions.HordeConfig.BinariesPath)->default_value(""),
+ "");
+
+ Options.add_option("horde",
+ "",
+ "horde-zen-service-port",
+ "Port number for Zen service communication",
+ cxxopts::value<uint16_t>(m_ServerOptions.HordeConfig.ZenServicePort)->default_value("8558"),
+ "");
+# endif
+
+# if ZEN_WITH_NOMAD
+ // Nomad provisioning options
+ Options.add_option("nomad",
+ "",
+ "nomad-enabled",
+ "Enable Nomad worker provisioning",
+ cxxopts::value<bool>(m_ServerOptions.NomadConfig.Enabled)->default_value("false"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-server",
+ "Nomad HTTP API URL",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.ServerUrl)->default_value(""),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-token",
+ "Nomad ACL token",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.AclToken)->default_value(""),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-datacenter",
+ "Nomad target datacenter",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.Datacenter)->default_value("dc1"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-namespace",
+ "Nomad namespace",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.Namespace)->default_value("default"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-region",
+ "Nomad region (empty for server default)",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.Region)->default_value(""),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-driver",
+ "Nomad task driver (raw_exec, docker)",
+ cxxopts::value<std::string>(m_NomadDriverStr)->default_value("raw_exec"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-distribution",
+ "Binary distribution mode (predeployed, artifact)",
+ cxxopts::value<std::string>(m_NomadDistributionStr)->default_value("predeployed"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-binary-path",
+ "Path to zenserver on Nomad clients (predeployed mode)",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.BinaryPath)->default_value(""),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-artifact-source",
+ "URL to download zenserver binary (artifact mode)",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.ArtifactSource)->default_value(""),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-docker-image",
+ "Docker image for zenserver (docker driver)",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.DockerImage)->default_value(""),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-max-jobs",
+ "Maximum concurrent Nomad jobs",
+ cxxopts::value<int>(m_ServerOptions.NomadConfig.MaxJobs)->default_value("64"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-cpu-mhz",
+ "CPU MHz allocated per Nomad task",
+ cxxopts::value<int>(m_ServerOptions.NomadConfig.CpuMhz)->default_value("1000"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-memory-mb",
+ "Memory MB allocated per Nomad task",
+ cxxopts::value<int>(m_ServerOptions.NomadConfig.MemoryMb)->default_value("2048"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-cores-per-job",
+ "Estimated cores per Nomad job (for scaling)",
+ cxxopts::value<int>(m_ServerOptions.NomadConfig.CoresPerJob)->default_value("32"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-max-cores",
+ "Maximum total cores to provision via Nomad",
+ cxxopts::value<int>(m_ServerOptions.NomadConfig.MaxCores)->default_value("2048"),
+ "");
+
+ Options.add_option("nomad",
+ "",
+ "nomad-job-prefix",
+ "Prefix for generated Nomad job IDs",
+ cxxopts::value<std::string>(m_ServerOptions.NomadConfig.JobPrefix)->default_value("zenserver-worker"),
+ "");
+# endif
}
void
@@ -63,6 +310,15 @@ ZenComputeServerConfigurator::OnConfigFileParsed(LuaConfig::Options& LuaOptions)
void
ZenComputeServerConfigurator::ValidateOptions()
{
+# if ZEN_WITH_HORDE
+ horde::FromString(m_ServerOptions.HordeConfig.Mode, m_HordeModeStr);
+ horde::FromString(m_ServerOptions.HordeConfig.EncryptionMode, m_HordeEncryptionStr);
+# endif
+
+# if ZEN_WITH_NOMAD
+ nomad::FromString(m_ServerOptions.NomadConfig.TaskDriver, m_NomadDriverStr);
+ nomad::FromString(m_ServerOptions.NomadConfig.BinDistribution, m_NomadDistributionStr);
+# endif
}
///////////////////////////////////////////////////////////////////////////
@@ -90,10 +346,14 @@ ZenComputeServer::Initialize(const ZenComputeServerConfig& ServerConfig, ZenServ
return EffectiveBasePort;
}
+ m_CoordinatorEndpoint = ServerConfig.CoordinatorEndpoint;
+ m_InstanceId = ServerConfig.InstanceId;
+ m_EnableWorkerWebSocket = ServerConfig.EnableWorkerWebSocket;
+
// This is a workaround to make sure we can have automated tests. Without
// this the ranges for different child zen compute processes could overlap with
// the main test range.
- ZenServerEnvironment::SetBaseChildId(1000);
+ ZenServerEnvironment::SetBaseChildId(2000);
m_DebugOptionForcedCrash = ServerConfig.ShouldCrash;
@@ -113,12 +373,54 @@ ZenComputeServer::Cleanup()
ZEN_INFO(ZEN_APP_NAME " cleaning up");
try
{
+ // Cancel the maintenance timer so it stops re-enqueuing before we
+ // tear down the provisioners it references.
+ m_ProvisionerMaintenanceTimer.cancel();
+ m_AnnounceTimer.cancel();
+
+# if ZEN_WITH_HORDE
+ // Shut down Horde provisioner first — this signals all agent threads
+ // to exit and joins them before we tear down HTTP services.
+ m_HordeProvisioner.reset();
+# endif
+
+# if ZEN_WITH_NOMAD
+ // Shut down Nomad provisioner — stops the management thread and
+ // sends stop requests for all tracked jobs.
+ m_NomadProvisioner.reset();
+# endif
+
+ // Close the orchestrator WebSocket client before stopping the io_context
+ m_WsReconnectTimer.cancel();
+ if (m_OrchestratorWsClient)
+ {
+ m_OrchestratorWsClient->Close();
+ m_OrchestratorWsClient.reset();
+ }
+ m_OrchestratorWsHandler.reset();
+
+ ResolveCloudMetadata();
+ m_CloudMetadata.reset();
+
+ // Shut down services that own threads or use the io_context before we
+ // stop the io_context and close the HTTP server.
+ if (m_OrchestratorService)
+ {
+ m_OrchestratorService->Shutdown();
+ }
+ if (m_ComputeService)
+ {
+ m_ComputeService->Shutdown();
+ }
+
m_IoContext.stop();
if (m_IoRunner.joinable())
{
m_IoRunner.join();
}
+ ShutdownServices();
+
if (m_Http)
{
m_Http->Close();
@@ -139,7 +441,8 @@ ZenComputeServer::InitializeState(const ZenComputeServerConfig& ServerConfig)
void
ZenComputeServer::InitializeServices(const ZenComputeServerConfig& ServerConfig)
{
- ZEN_INFO("initializing storage");
+ ZEN_TRACE_CPU("ZenComputeServer::InitializeServices");
+ ZEN_INFO("initializing compute services");
CidStoreConfiguration Config;
Config.RootDirectory = m_DataRoot / "cas";
@@ -147,46 +450,403 @@ ZenComputeServer::InitializeServices(const ZenComputeServerConfig& ServerConfig)
m_CidStore = std::make_unique<CidStore>(m_GcManager);
m_CidStore->Initialize(Config);
+ if (!ServerConfig.IdmsEndpoint.empty())
+ {
+ ZEN_INFO("detecting cloud environment (async)");
+ if (ServerConfig.IdmsEndpoint == "auto")
+ {
+ m_CloudMetadataFuture = std::async(std::launch::async, [DataDir = ServerConfig.DataDir] {
+ return std::make_unique<zen::compute::CloudMetadata>(DataDir / "cloud");
+ });
+ }
+ else
+ {
+ ZEN_INFO("using custom IDMS endpoint: {}", ServerConfig.IdmsEndpoint);
+ m_CloudMetadataFuture = std::async(std::launch::async, [DataDir = ServerConfig.DataDir, Endpoint = ServerConfig.IdmsEndpoint] {
+ return std::make_unique<zen::compute::CloudMetadata>(DataDir / "cloud", Endpoint);
+ });
+ }
+ }
+
ZEN_INFO("instantiating API service");
m_ApiService = std::make_unique<zen::HttpApiService>(*m_Http);
- ZEN_INFO("instantiating compute service");
- m_ComputeService = std::make_unique<HttpComputeService>(ServerConfig.DataDir / "compute");
+ ZEN_INFO("instantiating orchestrator service");
+ m_OrchestratorService =
+ std::make_unique<zen::compute::HttpOrchestratorService>(ServerConfig.DataDir / "orch", ServerConfig.EnableWorkerWebSocket);
- // Ref<zen::compute::FunctionRunner> Runner;
- // Runner = zen::compute::CreateLocalRunner(*m_CidStore, ServerConfig.DataDir / "runner");
+ ZEN_INFO("instantiating function service");
+ m_ComputeService = std::make_unique<zen::compute::HttpComputeService>(*m_CidStore,
+ m_StatsService,
+ ServerConfig.DataDir / "functions",
+ ServerConfig.MaxConcurrentActions);
- // TODO: (re)implement default configuration here
+ m_FrontendService = std::make_unique<HttpFrontendService>(m_ContentRoot, m_StatusService);
- ZEN_INFO("instantiating function service");
- m_FunctionService =
- std::make_unique<zen::compute::HttpFunctionService>(*m_CidStore, m_StatsService, ServerConfig.DataDir / "functions");
+# if ZEN_WITH_NOMAD
+ // Nomad provisioner
+ if (ServerConfig.NomadConfig.Enabled && !ServerConfig.NomadConfig.ServerUrl.empty())
+ {
+ ZEN_INFO("instantiating Nomad provisioner (server: {})", ServerConfig.NomadConfig.ServerUrl);
+
+ const auto& NomadCfg = ServerConfig.NomadConfig;
+
+ if (!NomadCfg.Validate())
+ {
+ ZEN_ERROR("invalid Nomad configuration");
+ }
+ else
+ {
+ ExtendableStringBuilder<256> OrchestratorEndpoint;
+ OrchestratorEndpoint << m_Http->GetServiceUri(m_OrchestratorService.get());
+ if (auto View = OrchestratorEndpoint.ToView(); !View.empty() && View.back() != '/')
+ {
+ OrchestratorEndpoint << '/';
+ }
+
+ m_NomadProvisioner = std::make_unique<nomad::NomadProvisioner>(NomadCfg, OrchestratorEndpoint);
+ }
+ }
+# endif
+
+# if ZEN_WITH_HORDE
+ // Horde provisioner
+ if (ServerConfig.HordeConfig.Enabled && !ServerConfig.HordeConfig.ServerUrl.empty())
+ {
+ ZEN_INFO("instantiating Horde provisioner (server: {})", ServerConfig.HordeConfig.ServerUrl);
+
+ const auto& HordeConfig = ServerConfig.HordeConfig;
+
+ if (!HordeConfig.Validate())
+ {
+ ZEN_ERROR("invalid Horde configuration");
+ }
+ else
+ {
+ ExtendableStringBuilder<256> OrchestratorEndpoint;
+ OrchestratorEndpoint << m_Http->GetServiceUri(m_OrchestratorService.get());
+ if (auto View = OrchestratorEndpoint.ToView(); !View.empty() && View.back() != '/')
+ {
+ OrchestratorEndpoint << '/';
+ }
+
+ // If no binaries path is specified, just use the running executable's directory
+ std::filesystem::path BinariesPath = HordeConfig.BinariesPath.empty() ? GetRunningExecutablePath().parent_path()
+ : std::filesystem::path(HordeConfig.BinariesPath);
+ std::filesystem::path WorkingDir = ServerConfig.DataDir / "horde";
+
+ m_HordeProvisioner = std::make_unique<horde::HordeProvisioner>(HordeConfig, BinariesPath, WorkingDir, OrchestratorEndpoint);
+ }
+ }
+# endif
+}
+
+void
+ZenComputeServer::ResolveCloudMetadata()
+{
+ if (m_CloudMetadataFuture.valid())
+ {
+ m_CloudMetadata = m_CloudMetadataFuture.get();
+ }
+}
+
+std::string
+ZenComputeServer::GetInstanceId() const
+{
+ if (!m_InstanceId.empty())
+ {
+ return m_InstanceId;
+ }
+ return fmt::format("{}-{}", GetMachineName(), GetCurrentProcessId());
+}
+
+std::string
+ZenComputeServer::GetAnnounceUrl() const
+{
+ return m_Http->GetServiceUri(nullptr);
}
void
ZenComputeServer::RegisterServices(const ZenComputeServerConfig& ServerConfig)
{
+ ZEN_TRACE_CPU("ZenComputeServer::RegisterServices");
ZEN_UNUSED(ServerConfig);
+ if (m_ApiService)
+ {
+ m_Http->RegisterService(*m_ApiService);
+ }
+
+ if (m_OrchestratorService)
+ {
+ m_Http->RegisterService(*m_OrchestratorService);
+ }
+
if (m_ComputeService)
{
m_Http->RegisterService(*m_ComputeService);
}
- if (m_ApiService)
+ if (m_FrontendService)
{
- m_Http->RegisterService(*m_ApiService);
+ m_Http->RegisterService(*m_FrontendService);
+ }
+}
+
+CbObject
+ZenComputeServer::BuildAnnounceBody()
+{
+ CbObjectWriter AnnounceBody;
+ AnnounceBody << "id" << GetInstanceId();
+ AnnounceBody << "uri" << GetAnnounceUrl();
+ AnnounceBody << "hostname" << GetMachineName();
+ AnnounceBody << "platform" << GetRuntimePlatformName();
+
+ ExtendedSystemMetrics Sm = ApplyReportingOverrides(m_MetricsTracker.Query());
+
+ AnnounceBody.BeginObject("metrics");
+ Describe(Sm, AnnounceBody);
+ AnnounceBody.EndObject();
+
+ AnnounceBody << "cpu_usage" << Sm.CpuUsagePercent;
+ AnnounceBody << "memory_total" << Sm.SystemMemoryMiB * 1024 * 1024;
+ AnnounceBody << "memory_used" << (Sm.SystemMemoryMiB - Sm.AvailSystemMemoryMiB) * 1024 * 1024;
+
+ AnnounceBody << "bytes_received" << m_Http->GetTotalBytesReceived();
+ AnnounceBody << "bytes_sent" << m_Http->GetTotalBytesSent();
+
+ auto Actions = m_ComputeService->GetActionCounts();
+ AnnounceBody << "actions_pending" << Actions.Pending;
+ AnnounceBody << "actions_running" << Actions.Running;
+ AnnounceBody << "actions_completed" << Actions.Completed;
+ AnnounceBody << "active_queues" << Actions.ActiveQueues;
+
+ // Derive provisioner from instance ID prefix (e.g. "horde-xxx" or "nomad-xxx")
+ if (m_InstanceId.starts_with("horde-"))
+ {
+ AnnounceBody << "provisioner"
+ << "horde";
+ }
+ else if (m_InstanceId.starts_with("nomad-"))
+ {
+ AnnounceBody << "provisioner"
+ << "nomad";
+ }
+
+ ResolveCloudMetadata();
+ if (m_CloudMetadata)
+ {
+ m_CloudMetadata->Describe(AnnounceBody);
+ }
+
+ return AnnounceBody.Save();
+}
+
+void
+ZenComputeServer::PostAnnounce()
+{
+ ZEN_TRACE_CPU("ZenComputeServer::PostAnnounce");
+
+ if (!m_ComputeService || m_CoordinatorEndpoint.empty())
+ {
+ return;
+ }
+
+ ZEN_INFO("notifying coordinator at '{}' of our availability at '{}'", m_CoordinatorEndpoint, GetAnnounceUrl());
+
+ try
+ {
+ CbObject Body = BuildAnnounceBody();
+
+ // If we have an active WebSocket connection, send via that instead of HTTP POST
+ if (m_OrchestratorWsClient && m_OrchestratorWsClient->IsOpen())
+ {
+ MemoryView View = Body.GetView();
+ m_OrchestratorWsClient->SendBinary(std::span<const uint8_t>(reinterpret_cast<const uint8_t*>(View.GetData()), View.GetSize()));
+ ZEN_INFO("announced to coordinator via WebSocket");
+ return;
+ }
+
+ HttpClient CoordinatorHttp(m_CoordinatorEndpoint);
+ HttpClient::Response Result = CoordinatorHttp.Post("announce", std::move(Body));
+
+ if (Result.Error)
+ {
+ ZEN_ERROR("failed to notify coordinator at '{}': HTTP error {} - {}",
+ m_CoordinatorEndpoint,
+ static_cast<int>(Result.Error->ErrorCode),
+ Result.Error->ErrorMessage);
+ }
+ else if (!IsHttpOk(Result.StatusCode))
+ {
+ ZEN_ERROR("failed to notify coordinator at '{}': unexpected HTTP status code {}",
+ m_CoordinatorEndpoint,
+ static_cast<int>(Result.StatusCode));
+ }
+ else
+ {
+ ZEN_INFO("successfully notified coordinator at '{}'", m_CoordinatorEndpoint);
+ }
+ }
+ catch (const std::exception& Ex)
+ {
+ ZEN_ERROR("failed to notify coordinator at '{}': {}", m_CoordinatorEndpoint, Ex.what());
+ }
+}
+
+void
+ZenComputeServer::EnqueueAnnounceTimer()
+{
+ if (!m_ComputeService || m_CoordinatorEndpoint.empty())
+ {
+ return;
+ }
+
+ m_AnnounceTimer.expires_after(std::chrono::seconds(15));
+ m_AnnounceTimer.async_wait([this](const asio::error_code& Ec) {
+ if (!Ec)
+ {
+ PostAnnounce();
+ EnqueueAnnounceTimer();
+ }
+ });
+ EnsureIoRunner();
+}
+
+void
+ZenComputeServer::InitializeOrchestratorWebSocket()
+{
+ if (!m_EnableWorkerWebSocket || m_CoordinatorEndpoint.empty())
+ {
+ return;
+ }
+
+ // Convert http://host:port → ws://host:port/orch/ws
+ std::string WsUrl = m_CoordinatorEndpoint;
+ if (WsUrl.starts_with("http://"))
+ {
+ WsUrl = "ws://" + WsUrl.substr(7);
+ }
+ else if (WsUrl.starts_with("https://"))
+ {
+ WsUrl = "wss://" + WsUrl.substr(8);
+ }
+ if (!WsUrl.empty() && WsUrl.back() != '/')
+ {
+ WsUrl += '/';
+ }
+ WsUrl += "orch/ws";
+
+ ZEN_INFO("establishing WebSocket link to orchestrator at {}", WsUrl);
+
+ m_OrchestratorWsHandler = std::make_unique<OrchestratorWsHandler>(*this);
+ m_OrchestratorWsClient =
+ std::make_unique<HttpWsClient>(WsUrl, *m_OrchestratorWsHandler, m_IoContext, HttpWsClientSettings{.LogCategory = "orch_ws"});
+
+ m_OrchestratorWsClient->Connect();
+ EnsureIoRunner();
+}
+
+void
+ZenComputeServer::EnqueueWsReconnect()
+{
+ m_WsReconnectTimer.expires_after(std::chrono::seconds(5));
+ m_WsReconnectTimer.async_wait([this](const asio::error_code& Ec) {
+ if (!Ec && m_OrchestratorWsClient)
+ {
+ ZEN_INFO("attempting WebSocket reconnect to orchestrator");
+ m_OrchestratorWsClient->Connect();
+ }
+ });
+ EnsureIoRunner();
+}
+
+void
+ZenComputeServer::OrchestratorWsHandler::OnWsOpen()
+{
+ ZEN_INFO("WebSocket link to orchestrator established");
+
+ // Send initial announce immediately over the WebSocket
+ Server.PostAnnounce();
+}
+
+void
+ZenComputeServer::OrchestratorWsHandler::OnWsMessage([[maybe_unused]] const WebSocketMessage& Msg)
+{
+ // Orchestrator does not push messages to workers; ignore
+}
+
+void
+ZenComputeServer::OrchestratorWsHandler::OnWsClose([[maybe_unused]] uint16_t Code, [[maybe_unused]] std::string_view Reason)
+{
+ ZEN_WARN("WebSocket link to orchestrator closed (code {}), falling back to HTTP announce", Code);
+
+ // Trigger an immediate HTTP announce so the orchestrator has fresh state,
+ // then schedule a reconnect attempt.
+ Server.PostAnnounce();
+ Server.EnqueueWsReconnect();
+}
+
+void
+ZenComputeServer::ProvisionerMaintenanceTick()
+{
+# if ZEN_WITH_HORDE
+ if (m_HordeProvisioner)
+ {
+ m_HordeProvisioner->SetTargetCoreCount(UINT32_MAX);
+ auto Stats = m_HordeProvisioner->GetStats();
+ ZEN_DEBUG("Horde maintenance: target={}, estimated={}, active={}",
+ Stats.TargetCoreCount,
+ Stats.EstimatedCoreCount,
+ Stats.ActiveCoreCount);
+ }
+# endif
+
+# if ZEN_WITH_NOMAD
+ if (m_NomadProvisioner)
+ {
+ m_NomadProvisioner->SetTargetCoreCount(UINT32_MAX);
+ auto Stats = m_NomadProvisioner->GetStats();
+ ZEN_DEBUG("Nomad maintenance: target={}, estimated={}, running jobs={}",
+ Stats.TargetCoreCount,
+ Stats.EstimatedCoreCount,
+ Stats.RunningJobCount);
}
+# endif
+}
+
+void
+ZenComputeServer::EnqueueProvisionerMaintenanceTimer()
+{
+ bool HasProvisioner = false;
+# if ZEN_WITH_HORDE
+ HasProvisioner = HasProvisioner || (m_HordeProvisioner != nullptr);
+# endif
+# if ZEN_WITH_NOMAD
+ HasProvisioner = HasProvisioner || (m_NomadProvisioner != nullptr);
+# endif
- if (m_FunctionService)
+ if (!HasProvisioner)
{
- m_Http->RegisterService(*m_FunctionService);
+ return;
}
+
+ m_ProvisionerMaintenanceTimer.expires_after(std::chrono::seconds(15));
+ m_ProvisionerMaintenanceTimer.async_wait([this](const asio::error_code& Ec) {
+ if (!Ec)
+ {
+ ProvisionerMaintenanceTick();
+ EnqueueProvisionerMaintenanceTimer();
+ }
+ });
+ EnsureIoRunner();
}
void
ZenComputeServer::Run()
{
+ ZEN_TRACE_CPU("ZenComputeServer::Run");
+
if (m_ProcessMonitor.IsActive())
{
CheckOwnerPid();
@@ -205,7 +865,7 @@ ZenComputeServer::Run()
ExtendableStringBuilder<256> BuildOptions;
GetBuildOptions(BuildOptions, '\n');
- ZEN_INFO("Build options ({}/{}):\n{}", GetOperatingSystemName(), GetCpuName(), BuildOptions);
+ ZEN_INFO("Build options ({}/{}, {}):\n{}", GetOperatingSystemName(), GetCpuName(), GetCompilerName(), BuildOptions);
}
ZEN_INFO(ZEN_APP_NAME " now running as COMPUTE (pid: {})", GetCurrentProcessId());
@@ -236,6 +896,35 @@ ZenComputeServer::Run()
OnReady();
+ PostAnnounce();
+ EnqueueAnnounceTimer();
+ InitializeOrchestratorWebSocket();
+
+# if ZEN_WITH_HORDE
+ // Start Horde provisioning if configured — request maximum allowed cores.
+ // SetTargetCoreCount clamps to HordeConfig::MaxCores internally.
+ if (m_HordeProvisioner)
+ {
+ ZEN_INFO("Horde provisioning starting");
+ m_HordeProvisioner->SetTargetCoreCount(UINT32_MAX);
+ auto Stats = m_HordeProvisioner->GetStats();
+ ZEN_INFO("Horde provisioning started (target cores: {})", Stats.TargetCoreCount);
+ }
+# endif
+
+# if ZEN_WITH_NOMAD
+ // Start Nomad provisioning if configured — request maximum allowed cores.
+ // SetTargetCoreCount clamps to NomadConfig::MaxCores internally.
+ if (m_NomadProvisioner)
+ {
+ m_NomadProvisioner->SetTargetCoreCount(UINT32_MAX);
+ auto Stats = m_NomadProvisioner->GetStats();
+ ZEN_INFO("Nomad provisioning started (target cores: {})", Stats.TargetCoreCount);
+ }
+# endif
+
+ EnqueueProvisionerMaintenanceTimer();
+
m_Http->Run(IsInteractiveMode);
SetNewState(kShuttingDown);
@@ -254,7 +943,10 @@ ZenComputeServerMain::ZenComputeServerMain(ZenComputeServerConfig& ServerOptions
void
ZenComputeServerMain::DoRun(ZenServerState::ZenServerEntry* Entry)
{
+ ZEN_TRACE_CPU("ZenComputeServerMain::DoRun");
+
ZenComputeServer Server;
+ Server.SetServerMode("Compute");
Server.SetDataRoot(m_ServerOptions.DataDir);
Server.SetContentRoot(m_ServerOptions.ContentDir);
Server.SetTestMode(m_ServerOptions.IsTest);
diff --git a/src/zenserver/compute/computeserver.h b/src/zenserver/compute/computeserver.h
index 625140b23..8f4edc0f0 100644
--- a/src/zenserver/compute/computeserver.h
+++ b/src/zenserver/compute/computeserver.h
@@ -6,7 +6,11 @@
#if ZEN_WITH_COMPUTE_SERVICES
+# include <future>
+# include <zencore/system.h>
+# include <zenhttp/httpwsclient.h>
# include <zenstore/gc.h>
+# include "frontend/frontend.h"
namespace cxxopts {
class Options;
@@ -16,19 +20,46 @@ struct Options;
}
namespace zen::compute {
-class HttpFunctionService;
-}
+class CloudMetadata;
+class HttpComputeService;
+class HttpOrchestratorService;
+} // namespace zen::compute
+
+# if ZEN_WITH_HORDE
+# include <zenhorde/hordeconfig.h>
+namespace zen::horde {
+class HordeProvisioner;
+} // namespace zen::horde
+# endif
+
+# if ZEN_WITH_NOMAD
+# include <zennomad/nomadconfig.h>
+namespace zen::nomad {
+class NomadProvisioner;
+} // namespace zen::nomad
+# endif
namespace zen {
class CidStore;
class HttpApiService;
-class HttpComputeService;
struct ZenComputeServerConfig : public ZenServerConfig
{
std::string UpstreamNotificationEndpoint;
std::string InstanceId; // For use in notifications
+ std::string CoordinatorEndpoint;
+ std::string IdmsEndpoint;
+ int32_t MaxConcurrentActions = 0; // 0 = auto (LogicalProcessorCount * 2)
+ bool EnableWorkerWebSocket = false; // Use WebSocket for worker↔orchestrator link
+
+# if ZEN_WITH_HORDE
+ horde::HordeConfig HordeConfig;
+# endif
+
+# if ZEN_WITH_NOMAD
+ nomad::NomadConfig NomadConfig;
+# endif
};
struct ZenComputeServerConfigurator : public ZenServerConfiguratorBase
@@ -49,6 +80,16 @@ private:
virtual void ValidateOptions() override;
ZenComputeServerConfig& m_ServerOptions;
+
+# if ZEN_WITH_HORDE
+ std::string m_HordeModeStr = "direct";
+ std::string m_HordeEncryptionStr = "none";
+# endif
+
+# if ZEN_WITH_NOMAD
+ std::string m_NomadDriverStr = "raw_exec";
+ std::string m_NomadDistributionStr = "predeployed";
+# endif
};
class ZenComputeServerMain : public ZenServerMain
@@ -88,17 +129,58 @@ public:
void Cleanup();
private:
- HttpStatsService m_StatsService;
- GcManager m_GcManager;
- GcScheduler m_GcScheduler{m_GcManager};
- std::unique_ptr<CidStore> m_CidStore;
- std::unique_ptr<HttpComputeService> m_ComputeService;
- std::unique_ptr<HttpApiService> m_ApiService;
- std::unique_ptr<zen::compute::HttpFunctionService> m_FunctionService;
-
- void InitializeState(const ZenComputeServerConfig& ServerConfig);
- void InitializeServices(const ZenComputeServerConfig& ServerConfig);
- void RegisterServices(const ZenComputeServerConfig& ServerConfig);
+ GcManager m_GcManager;
+ GcScheduler m_GcScheduler{m_GcManager};
+ std::unique_ptr<CidStore> m_CidStore;
+ std::unique_ptr<HttpApiService> m_ApiService;
+ std::unique_ptr<zen::compute::HttpComputeService> m_ComputeService;
+ std::unique_ptr<zen::compute::HttpOrchestratorService> m_OrchestratorService;
+ std::unique_ptr<zen::compute::CloudMetadata> m_CloudMetadata;
+ std::future<std::unique_ptr<zen::compute::CloudMetadata>> m_CloudMetadataFuture;
+ std::unique_ptr<HttpFrontendService> m_FrontendService;
+# if ZEN_WITH_HORDE
+ std::unique_ptr<zen::horde::HordeProvisioner> m_HordeProvisioner;
+# endif
+# if ZEN_WITH_NOMAD
+ std::unique_ptr<zen::nomad::NomadProvisioner> m_NomadProvisioner;
+# endif
+ SystemMetricsTracker m_MetricsTracker;
+ std::string m_CoordinatorEndpoint;
+ std::string m_InstanceId;
+
+ asio::steady_timer m_AnnounceTimer{m_IoContext};
+ asio::steady_timer m_ProvisionerMaintenanceTimer{m_IoContext};
+
+ void InitializeState(const ZenComputeServerConfig& ServerConfig);
+ void InitializeServices(const ZenComputeServerConfig& ServerConfig);
+ void RegisterServices(const ZenComputeServerConfig& ServerConfig);
+ void ResolveCloudMetadata();
+ void PostAnnounce();
+ void EnqueueAnnounceTimer();
+ void EnqueueProvisionerMaintenanceTimer();
+ void ProvisionerMaintenanceTick();
+ std::string GetAnnounceUrl() const;
+ std::string GetInstanceId() const;
+ CbObject BuildAnnounceBody();
+
+ // Worker→orchestrator WebSocket client
+ struct OrchestratorWsHandler : public IWsClientHandler
+ {
+ ZenComputeServer& Server;
+ explicit OrchestratorWsHandler(ZenComputeServer& S) : Server(S) {}
+
+ void OnWsOpen() override;
+ void OnWsMessage(const WebSocketMessage& Msg) override;
+ void OnWsClose(uint16_t Code, std::string_view Reason) override;
+ };
+
+ std::unique_ptr<OrchestratorWsHandler> m_OrchestratorWsHandler;
+ std::unique_ptr<HttpWsClient> m_OrchestratorWsClient;
+ asio::steady_timer m_WsReconnectTimer{m_IoContext};
+ bool m_EnableWorkerWebSocket = false;
+
+ void InitializeOrchestratorWebSocket();
+ void EnqueueWsReconnect();
};
} // namespace zen
diff --git a/src/zenserver/compute/computeservice.cpp b/src/zenserver/compute/computeservice.cpp
deleted file mode 100644
index 2c0bc0ae9..000000000
--- a/src/zenserver/compute/computeservice.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright Epic Games, Inc. All Rights Reserved.
-
-#include "computeservice.h"
-
-#if ZEN_WITH_COMPUTE_SERVICES
-
-# include <zencore/compactbinarybuilder.h>
-# include <zencore/filesystem.h>
-# include <zencore/fmtutils.h>
-# include <zencore/logging.h>
-# include <zencore/system.h>
-# include <zenutil/zenserverprocess.h>
-
-ZEN_THIRD_PARTY_INCLUDES_START
-# include <EASTL/fixed_vector.h>
-# include <asio.hpp>
-ZEN_THIRD_PARTY_INCLUDES_END
-
-# include <unordered_map>
-
-namespace zen {
-
-//////////////////////////////////////////////////////////////////////////
-
-struct ResourceMetrics
-{
- uint64_t DiskUsageBytes = 0;
- uint64_t MemoryUsageBytes = 0;
-};
-
-//////////////////////////////////////////////////////////////////////////
-
-struct HttpComputeService::Impl
-{
- Impl(const Impl&) = delete;
- Impl& operator=(const Impl&) = delete;
-
- Impl();
- ~Impl();
-
- void Initialize(std::filesystem::path BaseDir) { ZEN_UNUSED(BaseDir); }
-
- void Cleanup() {}
-
-private:
-};
-
-HttpComputeService::Impl::Impl()
-{
-}
-
-HttpComputeService::Impl::~Impl()
-{
-}
-
-///////////////////////////////////////////////////////////////////////////
-
-HttpComputeService::HttpComputeService(std::filesystem::path BaseDir) : m_Impl(std::make_unique<Impl>())
-{
- using namespace std::literals;
-
- m_Impl->Initialize(BaseDir);
-
- m_Router.RegisterRoute(
- "status",
- [this](HttpRouterRequest& Req) {
- CbObjectWriter Obj;
- Obj.BeginArray("modules");
- Obj.EndArray();
- Req.ServerRequest().WriteResponse(HttpResponseCode::OK, Obj.Save());
- },
- HttpVerb::kGet);
-
- m_Router.RegisterRoute(
- "stats",
- [this](HttpRouterRequest& Req) {
- CbObjectWriter Obj;
- Req.ServerRequest().WriteResponse(HttpResponseCode::OK, Obj.Save());
- },
- HttpVerb::kGet);
-}
-
-HttpComputeService::~HttpComputeService()
-{
-}
-
-const char*
-HttpComputeService::BaseUri() const
-{
- return "/compute/";
-}
-
-void
-HttpComputeService::HandleRequest(zen::HttpServerRequest& Request)
-{
- m_Router.HandleRequest(Request);
-}
-
-} // namespace zen
-#endif // ZEN_WITH_COMPUTE_SERVICES
diff --git a/src/zenserver/compute/computeservice.h b/src/zenserver/compute/computeservice.h
deleted file mode 100644
index 339200dd8..000000000
--- a/src/zenserver/compute/computeservice.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright Epic Games, Inc. All Rights Reserved.
-
-#pragma once
-
-#include <zenhttp/httpserver.h>
-
-#if ZEN_WITH_COMPUTE_SERVICES
-namespace zen {
-
-/** ZenServer Compute Service
- *
- * Manages a set of compute workers for use in UEFN content worker
- *
- */
-class HttpComputeService : public zen::HttpService
-{
-public:
- HttpComputeService(std::filesystem::path BaseDir);
- ~HttpComputeService();
-
- HttpComputeService(const HttpComputeService&) = delete;
- HttpComputeService& operator=(const HttpComputeService&) = delete;
-
- virtual const char* BaseUri() const override;
- virtual void HandleRequest(zen::HttpServerRequest& Request) override;
-
-private:
- HttpRequestRouter m_Router;
-
- struct Impl;
-
- std::unique_ptr<Impl> m_Impl;
-};
-
-} // namespace zen
-#endif // ZEN_WITH_COMPUTE_SERVICES