From 795345e5fd7974a1f5227d507a58bb3ed75eafd5 Mon Sep 17 00:00:00 2001 From: Stefan Boberg Date: Mon, 13 Apr 2026 16:38:16 +0200 Subject: Compute OIDC auth, async Horde agents, and orchestrator improvements (#913) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework of the Horde agent subsystem from synchronous per-thread I/O to an async ASIO-driven architecture, plus provisioner scale-down with graceful draining, OIDC authentication, scheduler improvements, and dashboard UI for provisioner control. ### Async Horde Agent Rewrite - Replace synchronous `HordeAgent` (one thread per agent, blocking I/O) with `AsyncHordeAgent` — an ASIO state machine running on a shared `io_context` thread pool - Replace `TcpComputeTransport`/`AesComputeTransport` with `AsyncTcpComputeTransport`/`AsyncAesComputeTransport` - Replace `AgentMessageChannel` with `AsyncAgentMessageChannel` using frame queuing and ASIO timers - Delete `ComputeBuffer` and `ComputeChannel` ring-buffer classes (no longer needed) ### Provisioner Drain / Scale-Down - `HordeProvisioner` can now drain agents when target core count is lowered: queries each agent's `/compute/session/status` for workload, selects candidates by largest-fit/lowest-workload, and sends `/compute/session/drain` - Configurable `--horde-drain-grace-period` (default 300s) before force-kill - Implement `IProvisionerStateProvider` interface to expose provisioner state to the orchestrator HTTP layer - Forward `--coordinator-session`, `--provision-clean`, and `--provision-tracehost` through both Horde and Nomad provisioners to spawned workers ### OIDC Authentication - `HordeClient` accepts an `AccessTokenProvider` (refreshable token function) as alternative to static `--horde-token` - Wire up `OidcToken.exe` auto-discovery via `httpclientauth::CreateFromOidcTokenExecutable` with `--HordeUrl` mode - New `--horde-oidctoken-exe-path` CLI option for explicit path override ### Orchestrator & Scheduler - Orchestrator generates a session ID at startup; workers include `coordinator_session` in announcements so the orchestrator can reject stale-session workers - New `Rejected` action state — when a remote runner declines at capacity, the action is rescheduled without retry count increment - Reduce scheduler lock contention: snapshot pending actions under shared lock, sort/trim outside the lock - Parallelize remote action submission across runners via `WorkerThreadPool` with slow-submit warnings - New action field `FailureReason` populated by all runner types (exit codes, sandbox failures, exceptions) - New endpoints: `session/drain`, `session/status`, `session/sunset`, `provisioner/status`, `provisioner/target` ### Remote Execution - Eager-attach mode for `RemoteHttpRunner` — bundles all attachments upfront in a `CbPackage` for single-roundtrip submits - Track in-flight submissions to prevent over-queuing - Show remote runner hostname in `GetDisplayName()` - `--announce-url` to override the endpoint announced to the coordinator (e.g. relay-visible address) ### Frontend Dashboard - Delete standalone `compute.html` (925 lines) and `orchestrator.html` (669 lines), consolidated into JS page modules - Add provisioner panel to orchestrator dashboard: target/active/estimated core counts, draining agent count - Editable target-cores input with debounced POST to `/orch/provisioner/target` - Per-agent provisioning status badges (active / draining / deallocated) in the agents table - Active vs total CPU counts in agents summary row ### CLI - New `zen compute record-start` / `record-stop` subcommands - `zen exec` progress bar with submit and completion phases, atomic work counters, `--progress` mode (Pretty/Plain/Quiet) ### Other - `DataDir` supports environment variable expansion - Worker manifest validation checks for `worker.zcb` marker to detect incomplete cached directories - Linux/Mac runners `nice(5)` child processes to avoid starving the main server - `ComputeService::SetShutdownCallback` wired to `RequestExit` via `session/sunset` - Curl HTTP client logs effective URL on failure - `MachineInfo` carries `Pool` and `Mode` from Horde response - Horde bundle creation includes `.pdb` on Windows --- src/zen/cmds/compute_cmd.cpp | 96 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 src/zen/cmds/compute_cmd.cpp (limited to 'src/zen/cmds/compute_cmd.cpp') diff --git a/src/zen/cmds/compute_cmd.cpp b/src/zen/cmds/compute_cmd.cpp new file mode 100644 index 000000000..01166cb0e --- /dev/null +++ b/src/zen/cmds/compute_cmd.cpp @@ -0,0 +1,96 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "compute_cmd.h" + +#if ZEN_WITH_COMPUTE_SERVICES + +# include +# include +# include + +using namespace std::literals; + +namespace zen { + +////////////////////////////////////////////////////////////////////////// +// ComputeRecordStartSubCmd + +ComputeRecordStartSubCmd::ComputeRecordStartSubCmd() : ZenSubCmdBase("record-start", "Start recording compute actions") +{ + SubOptions().add_option("", "u", "hosturl", ZenCmdBase::kHostUrlHelp, cxxopts::value(m_HostName)->default_value(""), ""); +} + +void +ComputeRecordStartSubCmd::Run(const ZenCliOptions& GlobalOptions) +{ + ZEN_UNUSED(GlobalOptions); + + m_HostName = ZenCmdBase::ResolveTargetHostSpec(m_HostName); + if (m_HostName.empty()) + { + throw OptionParseException("Unable to resolve server specification", SubOptions().help()); + } + + HttpClient Http = ZenCmdBase::CreateHttpClient(m_HostName); + if (HttpClient::Response Response = Http.Post("/compute/record/start"sv, HttpClient::KeyValueMap{}, HttpClient::KeyValueMap{})) + { + CbObject Obj = Response.AsObject(); + std::string_view Path = Obj["path"sv].AsString(); + ZEN_CONSOLE("recording started: " ZEN_BRIGHT_GREEN("{}"), Path); + } + else + { + Response.ThrowError("Failed to start recording"); + } +} + +////////////////////////////////////////////////////////////////////////// +// ComputeRecordStopSubCmd + +ComputeRecordStopSubCmd::ComputeRecordStopSubCmd() : ZenSubCmdBase("record-stop", "Stop recording compute actions") +{ + SubOptions().add_option("", "u", "hosturl", ZenCmdBase::kHostUrlHelp, cxxopts::value(m_HostName)->default_value(""), ""); +} + +void +ComputeRecordStopSubCmd::Run(const ZenCliOptions& GlobalOptions) +{ + ZEN_UNUSED(GlobalOptions); + + m_HostName = ZenCmdBase::ResolveTargetHostSpec(m_HostName); + if (m_HostName.empty()) + { + throw OptionParseException("Unable to resolve server specification", SubOptions().help()); + } + + HttpClient Http = ZenCmdBase::CreateHttpClient(m_HostName); + if (HttpClient::Response Response = Http.Post("/compute/record/stop"sv, HttpClient::KeyValueMap{}, HttpClient::KeyValueMap{})) + { + CbObject Obj = Response.AsObject(); + std::string_view Path = Obj["path"sv].AsString(); + ZEN_CONSOLE("recording stopped: " ZEN_BRIGHT_GREEN("{}"), Path); + } + else + { + Response.ThrowError("Failed to stop recording"); + } +} + +////////////////////////////////////////////////////////////////////////// +// ComputeCommand + +ComputeCommand::ComputeCommand() +{ + m_Options.add_options()("h,help", "Print help"); + m_Options.add_option("__hidden__", "", "subcommand", "", cxxopts::value(m_SubCommand)->default_value(""), ""); + m_Options.parse_positional({"subcommand"}); + + AddSubCommand(m_RecordStartSubCmd); + AddSubCommand(m_RecordStopSubCmd); +} + +ComputeCommand::~ComputeCommand() = default; + +} // namespace zen + +#endif // ZEN_WITH_COMPUTE_SERVICES -- cgit v1.2.3 From 27d72af24a8de9a81500e68a0874f1430297b3bc Mon Sep 17 00:00:00 2001 From: Stefan Boberg Date: Mon, 20 Apr 2026 23:52:38 +0200 Subject: Zen CLI common server interface (#920) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a common `ZenServiceClient` RAII wrapper for zen CLI commands that interact with a zenserver instance. CLI operations (admin, builds, cache, exec, hub, info, projectstore, trace, ui, version, vfs, workspaces) automatically register sessions so they become visible in the server's session list, and forward log output to the server's session log endpoint. All session HTTP I/O (announce, remove, log batches) runs on a single background worker thread, so CLI startup and shutdown never block on server availability. ### Key changes - **`ZenServiceClient`** — new RAII class that wraps host resolution, HTTP client creation, and session lifecycle (register on connect, remove on exit). Replaces ad-hoc boilerplate across all command files that talk to a server, including the new `trace` subcommands (`start`, `stop`, `status`). - **Async session I/O** — `SessionsServiceClient` now owns a single worker thread and command queue. `Announce()`, `Remove()`, and `UpdateMetadata()` enqueue commands and return immediately. The worker creates one `HttpClient` with a 5-second total timeout, bounding any individual request. Eliminates main-thread stalls when the server is unreachable. - **Session log forwarding** — `SessionLogSink` is a thin enqueuer that posts log messages to the same worker queue (no separate thread or HTTP client). Log levels are serialized as integers; the server-side ingest handles both string and integer formats for backwards compatibility, with bounds checking on integer values. - **Build & projectstore session registration** — Long-running `builds` and projectstore cache (oplog-download) connections register sessions too, making them visible alongside regular CLI command sessions. ### Cleanup - Extract `SetupCacheSession` helper on `StorageInstance` to reduce duplication. - Remove unused `HttpClient` reference in ui command. --- src/zen/cmds/compute_cmd.cpp | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'src/zen/cmds/compute_cmd.cpp') diff --git a/src/zen/cmds/compute_cmd.cpp b/src/zen/cmds/compute_cmd.cpp index 01166cb0e..9a350c69c 100644 --- a/src/zen/cmds/compute_cmd.cpp +++ b/src/zen/cmds/compute_cmd.cpp @@ -4,6 +4,8 @@ #if ZEN_WITH_COMPUTE_SERVICES +# include "zenserviceclient.h" + # include # include # include @@ -25,13 +27,8 @@ ComputeRecordStartSubCmd::Run(const ZenCliOptions& GlobalOptions) { ZEN_UNUSED(GlobalOptions); - m_HostName = ZenCmdBase::ResolveTargetHostSpec(m_HostName); - if (m_HostName.empty()) - { - throw OptionParseException("Unable to resolve server specification", SubOptions().help()); - } - - HttpClient Http = ZenCmdBase::CreateHttpClient(m_HostName); + ZenServiceClient Service({.HostSpec = m_HostName, .CommandName = "record-start"}); + HttpClient& Http = Service.Http(); if (HttpClient::Response Response = Http.Post("/compute/record/start"sv, HttpClient::KeyValueMap{}, HttpClient::KeyValueMap{})) { CbObject Obj = Response.AsObject(); @@ -57,13 +54,8 @@ ComputeRecordStopSubCmd::Run(const ZenCliOptions& GlobalOptions) { ZEN_UNUSED(GlobalOptions); - m_HostName = ZenCmdBase::ResolveTargetHostSpec(m_HostName); - if (m_HostName.empty()) - { - throw OptionParseException("Unable to resolve server specification", SubOptions().help()); - } - - HttpClient Http = ZenCmdBase::CreateHttpClient(m_HostName); + ZenServiceClient Service({.HostSpec = m_HostName, .CommandName = "record-stop"}); + HttpClient& Http = Service.Http(); if (HttpClient::Response Response = Http.Post("/compute/record/stop"sv, HttpClient::KeyValueMap{}, HttpClient::KeyValueMap{})) { CbObject Obj = Response.AsObject(); -- cgit v1.2.3