From 795345e5fd7974a1f5227d507a58bb3ed75eafd5 Mon Sep 17 00:00:00 2001 From: Stefan Boberg Date: Mon, 13 Apr 2026 16:38:16 +0200 Subject: Compute OIDC auth, async Horde agents, and orchestrator improvements (#913) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework of the Horde agent subsystem from synchronous per-thread I/O to an async ASIO-driven architecture, plus provisioner scale-down with graceful draining, OIDC authentication, scheduler improvements, and dashboard UI for provisioner control. ### Async Horde Agent Rewrite - Replace synchronous `HordeAgent` (one thread per agent, blocking I/O) with `AsyncHordeAgent` — an ASIO state machine running on a shared `io_context` thread pool - Replace `TcpComputeTransport`/`AesComputeTransport` with `AsyncTcpComputeTransport`/`AsyncAesComputeTransport` - Replace `AgentMessageChannel` with `AsyncAgentMessageChannel` using frame queuing and ASIO timers - Delete `ComputeBuffer` and `ComputeChannel` ring-buffer classes (no longer needed) ### Provisioner Drain / Scale-Down - `HordeProvisioner` can now drain agents when target core count is lowered: queries each agent's `/compute/session/status` for workload, selects candidates by largest-fit/lowest-workload, and sends `/compute/session/drain` - Configurable `--horde-drain-grace-period` (default 300s) before force-kill - Implement `IProvisionerStateProvider` interface to expose provisioner state to the orchestrator HTTP layer - Forward `--coordinator-session`, `--provision-clean`, and `--provision-tracehost` through both Horde and Nomad provisioners to spawned workers ### OIDC Authentication - `HordeClient` accepts an `AccessTokenProvider` (refreshable token function) as alternative to static `--horde-token` - Wire up `OidcToken.exe` auto-discovery via `httpclientauth::CreateFromOidcTokenExecutable` with `--HordeUrl` mode - New `--horde-oidctoken-exe-path` CLI option for explicit path override ### Orchestrator & Scheduler - Orchestrator generates a session ID at startup; workers include `coordinator_session` in announcements so the orchestrator can reject stale-session workers - New `Rejected` action state — when a remote runner declines at capacity, the action is rescheduled without retry count increment - Reduce scheduler lock contention: snapshot pending actions under shared lock, sort/trim outside the lock - Parallelize remote action submission across runners via `WorkerThreadPool` with slow-submit warnings - New action field `FailureReason` populated by all runner types (exit codes, sandbox failures, exceptions) - New endpoints: `session/drain`, `session/status`, `session/sunset`, `provisioner/status`, `provisioner/target` ### Remote Execution - Eager-attach mode for `RemoteHttpRunner` — bundles all attachments upfront in a `CbPackage` for single-roundtrip submits - Track in-flight submissions to prevent over-queuing - Show remote runner hostname in `GetDisplayName()` - `--announce-url` to override the endpoint announced to the coordinator (e.g. relay-visible address) ### Frontend Dashboard - Delete standalone `compute.html` (925 lines) and `orchestrator.html` (669 lines), consolidated into JS page modules - Add provisioner panel to orchestrator dashboard: target/active/estimated core counts, draining agent count - Editable target-cores input with debounced POST to `/orch/provisioner/target` - Per-agent provisioning status badges (active / draining / deallocated) in the agents table - Active vs total CPU counts in agents summary row ### CLI - New `zen compute record-start` / `record-stop` subcommands - `zen exec` progress bar with submit and completion phases, atomic work counters, `--progress` mode (Pretty/Plain/Quiet) ### Other - `DataDir` supports environment variable expansion - Worker manifest validation checks for `worker.zcb` marker to detect incomplete cached directories - Linux/Mac runners `nice(5)` child processes to avoid starving the main server - `ComputeService::SetShutdownCallback` wired to `RequestExit` via `session/sunset` - Curl HTTP client logs effective URL on failure - `MachineInfo` carries `Pool` and `Mode` from Horde response - Horde bundle creation includes `.pdb` on Windows --- src/zencompute/runners/linuxrunner.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'src/zencompute/runners/linuxrunner.cpp') diff --git a/src/zencompute/runners/linuxrunner.cpp b/src/zencompute/runners/linuxrunner.cpp index 9055005d9..ce5bbdcc8 100644 --- a/src/zencompute/runners/linuxrunner.cpp +++ b/src/zencompute/runners/linuxrunner.cpp @@ -430,7 +430,8 @@ LinuxProcessRunner::SubmitAction(Ref Action) if (ChildPid == 0) { - // Child process + // Child process — lower priority so workers don't starve the main server + nice(5); if (m_Sandboxed) { @@ -481,7 +482,8 @@ LinuxProcessRunner::SubmitAction(Ref Action) // Clean up the sandbox in the background m_DeferredDeleter.Enqueue(Action->ActionLsn, std::move(Prepared->SandboxPath)); - ZEN_ERROR("Sandbox setup failed for action {}: {}", Action->ActionLsn, ErrBuf); + Action->FailureReason = fmt::format("sandbox setup failed: {}", ErrBuf); + ZEN_ERROR("action {} ({}): {}", Action->ActionId, Action->ActionLsn, Action->FailureReason); Action->SetActionState(RunnerAction::State::Failed); return SubmitResult{.IsAccepted = false}; -- cgit v1.2.3 From 3d59b5d7036c35fe484d052ff32dbdc9d0a75cf7 Mon Sep 17 00:00:00 2001 From: Dan Engelbrecht Date: Mon, 13 Apr 2026 19:17:09 +0200 Subject: fix utf characters in source code (#953) --- src/zencompute/runners/linuxrunner.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'src/zencompute/runners/linuxrunner.cpp') diff --git a/src/zencompute/runners/linuxrunner.cpp b/src/zencompute/runners/linuxrunner.cpp index ce5bbdcc8..be4274823 100644 --- a/src/zencompute/runners/linuxrunner.cpp +++ b/src/zencompute/runners/linuxrunner.cpp @@ -195,7 +195,7 @@ namespace { WriteErrorAndExit(ErrorPipeFd, "bind mount /lib failed", errno); } - // /lib64 (optional — not all distros have it) + // /lib64 (optional - not all distros have it) { struct stat St; if (stat("/lib64", &St) == 0 && S_ISDIR(St.st_mode)) @@ -208,7 +208,7 @@ namespace { } } - // /etc (required — for resolv.conf, ld.so.cache, etc.) + // /etc (required - for resolv.conf, ld.so.cache, etc.) if (MkdirIfNeeded(BuildPath("etc"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/etc failed", errno); @@ -218,7 +218,7 @@ namespace { WriteErrorAndExit(ErrorPipeFd, "bind mount /etc failed", errno); } - // /worker — bind-mount worker directory (contains the executable) + // /worker - bind-mount worker directory (contains the executable) if (MkdirIfNeeded(BuildPath("worker"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/worker failed", errno); @@ -430,12 +430,12 @@ LinuxProcessRunner::SubmitAction(Ref Action) if (ChildPid == 0) { - // Child process — lower priority so workers don't starve the main server + // Child process - lower priority so workers don't starve the main server nice(5); if (m_Sandboxed) { - // Close read end of error pipe — child only writes + // Close read end of error pipe - child only writes close(ErrorPipe[0]); SetupNamespaceSandbox(SandboxPathStr.c_str(), CurrentUid, CurrentGid, WorkerPathStr.c_str(), ErrorPipe[1]); @@ -462,7 +462,7 @@ LinuxProcessRunner::SubmitAction(Ref Action) if (m_Sandboxed) { - // Close write end of error pipe — parent only reads + // Close write end of error pipe - parent only reads close(ErrorPipe[1]); // Read from error pipe. If execve succeeded, pipe was closed by O_CLOEXEC @@ -679,7 +679,7 @@ ReadProcStatCpuTicks(pid_t Pid) Buf[Len] = '\0'; - // Skip past "pid (name) " — find last ')' to handle names containing spaces or parens + // Skip past "pid (name) " - find last ')' to handle names containing spaces or parens const char* P = strrchr(Buf, ')'); if (!P) { @@ -709,7 +709,7 @@ LinuxProcessRunner::SampleProcessCpu(RunningAction& Running) if (CurrentOsTicks == 0) { - // Process gone or /proc entry unreadable — record timestamp without updating usage + // Process gone or /proc entry unreadable - record timestamp without updating usage Running.LastCpuSampleTicks = NowTicks; Running.LastCpuOsTicks = 0; return; -- cgit v1.2.3