aboutsummaryrefslogtreecommitdiff
path: root/src/zenserver/frontend/html/pages/orchestrator.js
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2026-04-13 16:38:16 +0200
committerGitHub Enterprise <[email protected]>2026-04-13 16:38:16 +0200
commit795345e5fd7974a1f5227d507a58bb3ed75eafd5 (patch)
tree7a0f142bf562c3590400586c82b0e7a1b5ad6493 /src/zenserver/frontend/html/pages/orchestrator.js
parent5.8.4-pre2 (diff)
downloadzen-795345e5fd7974a1f5227d507a58bb3ed75eafd5.tar.xz
zen-795345e5fd7974a1f5227d507a58bb3ed75eafd5.zip
Compute OIDC auth, async Horde agents, and orchestrator improvements (#913)
Rework of the Horde agent subsystem from synchronous per-thread I/O to an async ASIO-driven architecture, plus provisioner scale-down with graceful draining, OIDC authentication, scheduler improvements, and dashboard UI for provisioner control. ### Async Horde Agent Rewrite - Replace synchronous `HordeAgent` (one thread per agent, blocking I/O) with `AsyncHordeAgent` — an ASIO state machine running on a shared `io_context` thread pool - Replace `TcpComputeTransport`/`AesComputeTransport` with `AsyncTcpComputeTransport`/`AsyncAesComputeTransport` - Replace `AgentMessageChannel` with `AsyncAgentMessageChannel` using frame queuing and ASIO timers - Delete `ComputeBuffer` and `ComputeChannel` ring-buffer classes (no longer needed) ### Provisioner Drain / Scale-Down - `HordeProvisioner` can now drain agents when target core count is lowered: queries each agent's `/compute/session/status` for workload, selects candidates by largest-fit/lowest-workload, and sends `/compute/session/drain` - Configurable `--horde-drain-grace-period` (default 300s) before force-kill - Implement `IProvisionerStateProvider` interface to expose provisioner state to the orchestrator HTTP layer - Forward `--coordinator-session`, `--provision-clean`, and `--provision-tracehost` through both Horde and Nomad provisioners to spawned workers ### OIDC Authentication - `HordeClient` accepts an `AccessTokenProvider` (refreshable token function) as alternative to static `--horde-token` - Wire up `OidcToken.exe` auto-discovery via `httpclientauth::CreateFromOidcTokenExecutable` with `--HordeUrl` mode - New `--horde-oidctoken-exe-path` CLI option for explicit path override ### Orchestrator & Scheduler - Orchestrator generates a session ID at startup; workers include `coordinator_session` in announcements so the orchestrator can reject stale-session workers - New `Rejected` action state — when a remote runner declines at capacity, the action is rescheduled without retry count increment - Reduce scheduler lock contention: snapshot pending actions under shared lock, sort/trim outside the lock - Parallelize remote action submission across runners via `WorkerThreadPool` with slow-submit warnings - New action field `FailureReason` populated by all runner types (exit codes, sandbox failures, exceptions) - New endpoints: `session/drain`, `session/status`, `session/sunset`, `provisioner/status`, `provisioner/target` ### Remote Execution - Eager-attach mode for `RemoteHttpRunner` — bundles all attachments upfront in a `CbPackage` for single-roundtrip submits - Track in-flight submissions to prevent over-queuing - Show remote runner hostname in `GetDisplayName()` - `--announce-url` to override the endpoint announced to the coordinator (e.g. relay-visible address) ### Frontend Dashboard - Delete standalone `compute.html` (925 lines) and `orchestrator.html` (669 lines), consolidated into JS page modules - Add provisioner panel to orchestrator dashboard: target/active/estimated core counts, draining agent count - Editable target-cores input with debounced POST to `/orch/provisioner/target` - Per-agent provisioning status badges (active / draining / deallocated) in the agents table - Active vs total CPU counts in agents summary row ### CLI - New `zen compute record-start` / `record-stop` subcommands - `zen exec` progress bar with submit and completion phases, atomic work counters, `--progress` mode (Pretty/Plain/Quiet) ### Other - `DataDir` supports environment variable expansion - Worker manifest validation checks for `worker.zcb` marker to detect incomplete cached directories - Linux/Mac runners `nice(5)` child processes to avoid starving the main server - `ComputeService::SetShutdownCallback` wired to `RequestExit` via `session/sunset` - Curl HTTP client logs effective URL on failure - `MachineInfo` carries `Pool` and `Mode` from Horde response - Horde bundle creation includes `.pdb` on Windows
Diffstat (limited to 'src/zenserver/frontend/html/pages/orchestrator.js')
-rw-r--r--src/zenserver/frontend/html/pages/orchestrator.js210
1 files changed, 206 insertions, 4 deletions
diff --git a/src/zenserver/frontend/html/pages/orchestrator.js b/src/zenserver/frontend/html/pages/orchestrator.js
index a280fabdb..30f6a8122 100644
--- a/src/zenserver/frontend/html/pages/orchestrator.js
+++ b/src/zenserver/frontend/html/pages/orchestrator.js
@@ -14,6 +14,14 @@ export class Page extends ZenPage
{
this.set_title("orchestrator");
+ // Provisioner section (hidden until data arrives)
+ this._prov_section = this._collapsible_section("Provisioner");
+ this._prov_section._parent.inner().style.display = "none";
+ this._prov_grid = null;
+ this._prov_target_dirty = false;
+ this._prov_commit_timer = null;
+ this._prov_last_target = null;
+
// Agents section
const agents_section = this._collapsible_section("Compute Agents");
this._agents_host = agents_section;
@@ -50,11 +58,12 @@ export class Page extends ZenPage
{
try
{
- const [agents, history, clients, client_history] = await Promise.all([
+ const [agents, history, clients, client_history, prov] = await Promise.all([
new Fetcher().resource("/orch/agents").json(),
new Fetcher().resource("/orch/history").param("limit", "50").json().catch(() => null),
new Fetcher().resource("/orch/clients").json().catch(() => null),
new Fetcher().resource("/orch/clients/history").param("limit", "50").json().catch(() => null),
+ new Fetcher().resource("/orch/provisioner/status").json().catch(() => null),
]);
this._render_agents(agents);
@@ -70,6 +79,7 @@ export class Page extends ZenPage
{
this._render_client_history(client_history.client_events || []);
}
+ this._render_provisioner(prov);
}
catch (e) { /* service unavailable */ }
}
@@ -109,6 +119,7 @@ export class Page extends ZenPage
{
this._render_client_history(data.client_events);
}
+ this._render_provisioner(data.provisioner);
}
catch (e) { /* ignore parse errors */ }
};
@@ -156,7 +167,7 @@ export class Page extends ZenPage
return;
}
- let totalCpus = 0, totalWeightedCpu = 0;
+ let totalCpus = 0, activeCpus = 0, totalWeightedCpu = 0;
let totalMemUsed = 0, totalMemTotal = 0;
let totalQueues = 0, totalPending = 0, totalRunning = 0, totalCompleted = 0;
let totalRecv = 0, totalSent = 0;
@@ -173,8 +184,14 @@ export class Page extends ZenPage
const completed = w.actions_completed || 0;
const recv = w.bytes_received || 0;
const sent = w.bytes_sent || 0;
+ const provisioner = w.provisioner || "";
+ const isProvisioned = provisioner !== "";
totalCpus += cpus;
+ if (w.provisioner_status === "active")
+ {
+ activeCpus += cpus;
+ }
if (cpus > 0 && typeof cpuUsage === "number")
{
totalWeightedCpu += cpuUsage * cpus;
@@ -209,12 +226,49 @@ export class Page extends ZenPage
cell.inner().textContent = "";
cell.tag("a").text(hostname).attr("href", w.uri + "/dashboard/compute/").attr("target", "_blank");
}
+
+ // Visual treatment based on provisioner status
+ const provStatus = w.provisioner_status || "";
+ if (!isProvisioned)
+ {
+ row.inner().style.opacity = "0.45";
+ }
+ else
+ {
+ const hostCell = row.get_cell(0);
+ const el = hostCell.inner();
+ const badge = document.createElement("span");
+ const badgeBase = "display:inline-block;margin-left:6px;padding:1px 5px;border-radius:8px;" +
+ "font-size:9px;font-weight:600;color:#fff;vertical-align:middle;";
+
+ if (provStatus === "draining")
+ {
+ badge.textContent = "draining";
+ badge.style.cssText = badgeBase + "background:var(--theme_warn);";
+ row.inner().style.opacity = "0.6";
+ }
+ else if (provStatus === "active")
+ {
+ badge.textContent = provisioner;
+ badge.style.cssText = badgeBase + "background:#8957e5;";
+ }
+ else
+ {
+ badge.textContent = "deallocated";
+ badge.style.cssText = badgeBase + "background:var(--theme_fail);";
+ row.inner().style.opacity = "0.45";
+ }
+ el.appendChild(badge);
+ }
}
- // Total row
+ // Total row — show active / total in CPUs column
+ const cpuLabel = activeCpus < totalCpus
+ ? Friendly.sep(activeCpus) + " / " + Friendly.sep(totalCpus)
+ : Friendly.sep(totalCpus);
const total = this._agents_table.add_row(
"TOTAL",
- Friendly.sep(totalCpus),
+ cpuLabel,
"",
totalMemTotal > 0 ? Friendly.bytes(totalMemUsed) + " / " + Friendly.bytes(totalMemTotal) : "-",
Friendly.sep(totalQueues),
@@ -305,6 +359,154 @@ export class Page extends ZenPage
}
}
+ _render_provisioner(prov)
+ {
+ const container = this._prov_section._parent.inner();
+
+ if (!prov || !prov.name)
+ {
+ container.style.display = "none";
+ return;
+ }
+ container.style.display = "";
+
+ if (!this._prov_grid)
+ {
+ this._prov_grid = this._prov_section.tag().classify("grid").classify("stats-tiles");
+ this._prov_tiles = {};
+
+ // Target cores tile with editable input
+ const target_tile = this._prov_grid.tag().classify("card").classify("stats-tile");
+ target_tile.tag().classify("card-title").text("Target Cores");
+ const target_body = target_tile.tag().classify("tile-metrics");
+ const target_m = target_body.tag().classify("tile-metric").classify("tile-metric-hero");
+ const input = document.createElement("input");
+ input.type = "number";
+ input.min = "0";
+ input.style.cssText = "width:100px;padding:4px 8px;border:1px solid var(--theme_g2);border-radius:4px;" +
+ "background:var(--theme_g4);color:var(--theme_bright);font-size:20px;font-weight:600;text-align:right;";
+ target_m.inner().appendChild(input);
+ target_m.tag().classify("metric-label").text("target");
+ this._prov_tiles.target_input = input;
+
+ input.addEventListener("focus", () => { this._prov_target_dirty = true; });
+ input.addEventListener("input", () => {
+ this._prov_target_dirty = true;
+ if (this._prov_commit_timer)
+ {
+ clearTimeout(this._prov_commit_timer);
+ }
+ this._prov_commit_timer = setTimeout(() => this._commit_provisioner_target(), 800);
+ });
+ input.addEventListener("keydown", (e) => {
+ if (e.key === "Enter")
+ {
+ if (this._prov_commit_timer)
+ {
+ clearTimeout(this._prov_commit_timer);
+ }
+ this._commit_provisioner_target();
+ input.blur();
+ }
+ });
+ input.addEventListener("blur", () => {
+ if (this._prov_commit_timer)
+ {
+ clearTimeout(this._prov_commit_timer);
+ }
+ this._commit_provisioner_target();
+ });
+
+ // Active cores
+ const active_tile = this._prov_grid.tag().classify("card").classify("stats-tile");
+ active_tile.tag().classify("card-title").text("Active Cores");
+ const active_body = active_tile.tag().classify("tile-metrics");
+ this._prov_tiles.active = active_body;
+
+ // Estimated cores
+ const est_tile = this._prov_grid.tag().classify("card").classify("stats-tile");
+ est_tile.tag().classify("card-title").text("Estimated Cores");
+ const est_body = est_tile.tag().classify("tile-metrics");
+ this._prov_tiles.estimated = est_body;
+
+ // Agents
+ const agents_tile = this._prov_grid.tag().classify("card").classify("stats-tile");
+ agents_tile.tag().classify("card-title").text("Agents");
+ const agents_body = agents_tile.tag().classify("tile-metrics");
+ this._prov_tiles.agents = agents_body;
+
+ // Draining
+ const drain_tile = this._prov_grid.tag().classify("card").classify("stats-tile");
+ drain_tile.tag().classify("card-title").text("Draining");
+ const drain_body = drain_tile.tag().classify("tile-metrics");
+ this._prov_tiles.draining = drain_body;
+ }
+
+ // Update values
+ const input = this._prov_tiles.target_input;
+ if (!this._prov_target_dirty && document.activeElement !== input)
+ {
+ input.value = prov.target_cores;
+ }
+ this._prov_last_target = prov.target_cores;
+
+ // Re-render metric tiles (clear and recreate content)
+ for (const key of ["active", "estimated", "agents", "draining"])
+ {
+ this._prov_tiles[key].inner().innerHTML = "";
+ }
+ this._metric(this._prov_tiles.active, Friendly.sep(prov.active_cores), "cores", true);
+ this._metric(this._prov_tiles.estimated, Friendly.sep(prov.estimated_cores), "cores", true);
+ this._metric(this._prov_tiles.agents, Friendly.sep(prov.agents), "active", true);
+ this._metric(this._prov_tiles.draining, Friendly.sep(prov.agents_draining || 0), "agents", true);
+ }
+
+ async _commit_provisioner_target()
+ {
+ const input = this._prov_tiles?.target_input;
+ if (!input || this._prov_committing)
+ {
+ return;
+ }
+ const value = parseInt(input.value, 10);
+ if (isNaN(value) || value < 0)
+ {
+ return;
+ }
+ if (value === this._prov_last_target)
+ {
+ this._prov_target_dirty = false;
+ return;
+ }
+ this._prov_committing = true;
+ try
+ {
+ const resp = await fetch("/orch/provisioner/target", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ target_cores: value }),
+ });
+ if (resp.ok)
+ {
+ this._prov_target_dirty = false;
+ console.log("Target cores set to", value);
+ }
+ else
+ {
+ const text = await resp.text();
+ console.error("Failed to set target cores: HTTP", resp.status, text);
+ }
+ }
+ catch (e)
+ {
+ console.error("Failed to set target cores:", e);
+ }
+ finally
+ {
+ this._prov_committing = false;
+ }
+ }
+
_metric(parent, value, label, hero = false)
{
const m = parent.tag().classify("tile-metric");