diff options
| author | Stefan Boberg <[email protected]> | 2026-04-13 16:38:16 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2026-04-13 16:38:16 +0200 |
| commit | 795345e5fd7974a1f5227d507a58bb3ed75eafd5 (patch) | |
| tree | 7a0f142bf562c3590400586c82b0e7a1b5ad6493 /src/zenhorde/hordeclient.cpp | |
| parent | 5.8.4-pre2 (diff) | |
| download | zen-795345e5fd7974a1f5227d507a58bb3ed75eafd5.tar.xz zen-795345e5fd7974a1f5227d507a58bb3ed75eafd5.zip | |
Compute OIDC auth, async Horde agents, and orchestrator improvements (#913)
Rework of the Horde agent subsystem from synchronous per-thread I/O to an async ASIO-driven architecture, plus provisioner scale-down with graceful draining, OIDC authentication, scheduler improvements, and dashboard UI for provisioner control.
### Async Horde Agent Rewrite
- Replace synchronous `HordeAgent` (one thread per agent, blocking I/O) with `AsyncHordeAgent` — an ASIO state machine running on a shared `io_context` thread pool
- Replace `TcpComputeTransport`/`AesComputeTransport` with `AsyncTcpComputeTransport`/`AsyncAesComputeTransport`
- Replace `AgentMessageChannel` with `AsyncAgentMessageChannel` using frame queuing and ASIO timers
- Delete `ComputeBuffer` and `ComputeChannel` ring-buffer classes (no longer needed)
### Provisioner Drain / Scale-Down
- `HordeProvisioner` can now drain agents when target core count is lowered: queries each agent's `/compute/session/status` for workload, selects candidates by largest-fit/lowest-workload, and sends `/compute/session/drain`
- Configurable `--horde-drain-grace-period` (default 300s) before force-kill
- Implement `IProvisionerStateProvider` interface to expose provisioner state to the orchestrator HTTP layer
- Forward `--coordinator-session`, `--provision-clean`, and `--provision-tracehost` through both Horde and Nomad provisioners to spawned workers
### OIDC Authentication
- `HordeClient` accepts an `AccessTokenProvider` (refreshable token function) as alternative to static `--horde-token`
- Wire up `OidcToken.exe` auto-discovery via `httpclientauth::CreateFromOidcTokenExecutable` with `--HordeUrl` mode
- New `--horde-oidctoken-exe-path` CLI option for explicit path override
### Orchestrator & Scheduler
- Orchestrator generates a session ID at startup; workers include `coordinator_session` in announcements so the orchestrator can reject stale-session workers
- New `Rejected` action state — when a remote runner declines at capacity, the action is rescheduled without retry count increment
- Reduce scheduler lock contention: snapshot pending actions under shared lock, sort/trim outside the lock
- Parallelize remote action submission across runners via `WorkerThreadPool` with slow-submit warnings
- New action field `FailureReason` populated by all runner types (exit codes, sandbox failures, exceptions)
- New endpoints: `session/drain`, `session/status`, `session/sunset`, `provisioner/status`, `provisioner/target`
### Remote Execution
- Eager-attach mode for `RemoteHttpRunner` — bundles all attachments upfront in a `CbPackage` for single-roundtrip submits
- Track in-flight submissions to prevent over-queuing
- Show remote runner hostname in `GetDisplayName()`
- `--announce-url` to override the endpoint announced to the coordinator (e.g. relay-visible address)
### Frontend Dashboard
- Delete standalone `compute.html` (925 lines) and `orchestrator.html` (669 lines), consolidated into JS page modules
- Add provisioner panel to orchestrator dashboard: target/active/estimated core counts, draining agent count
- Editable target-cores input with debounced POST to `/orch/provisioner/target`
- Per-agent provisioning status badges (active / draining / deallocated) in the agents table
- Active vs total CPU counts in agents summary row
### CLI
- New `zen compute record-start` / `record-stop` subcommands
- `zen exec` progress bar with submit and completion phases, atomic work counters, `--progress` mode (Pretty/Plain/Quiet)
### Other
- `DataDir` supports environment variable expansion
- Worker manifest validation checks for `worker.zcb` marker to detect incomplete cached directories
- Linux/Mac runners `nice(5)` child processes to avoid starving the main server
- `ComputeService::SetShutdownCallback` wired to `RequestExit` via `session/sunset`
- Curl HTTP client logs effective URL on failure
- `MachineInfo` carries `Pool` and `Mode` from Horde response
- Horde bundle creation includes `.pdb` on Windows
Diffstat (limited to 'src/zenhorde/hordeclient.cpp')
| -rw-r--r-- | src/zenhorde/hordeclient.cpp | 65 |
1 files changed, 21 insertions, 44 deletions
diff --git a/src/zenhorde/hordeclient.cpp b/src/zenhorde/hordeclient.cpp index 0eefc57c6..618a85e0e 100644 --- a/src/zenhorde/hordeclient.cpp +++ b/src/zenhorde/hordeclient.cpp @@ -4,6 +4,7 @@ #include <zencore/iobuffer.h> #include <zencore/logging.h> #include <zencore/memoryview.h> +#include <zencore/string.h> #include <zencore/trace.h> #include <zenhorde/hordeclient.h> #include <zenhttp/httpclient.h> @@ -14,7 +15,7 @@ ZEN_THIRD_PARTY_INCLUDES_END namespace zen::horde { -HordeClient::HordeClient(const HordeConfig& Config) : m_Config(Config), m_Log(zen::logging::Get("horde.client")) +HordeClient::HordeClient(HordeConfig Config) : m_Config(std::move(Config)), m_Log("horde.client") { } @@ -32,7 +33,11 @@ HordeClient::Initialize() Settings.RetryCount = 1; Settings.ExpectedErrorCodes = {HttpResponseCode::ServiceUnavailable, HttpResponseCode::TooManyRequests}; - if (!m_Config.AuthToken.empty()) + if (m_Config.AccessTokenProvider) + { + Settings.AccessTokenProvider = m_Config.AccessTokenProvider; + } + else if (!m_Config.AuthToken.empty()) { Settings.AccessTokenProvider = [token = m_Config.AuthToken]() -> HttpClientAccessToken { return HttpClientAccessToken(token, HttpClientAccessToken::Clock::now() + std::chrono::hours{24}); @@ -41,7 +46,7 @@ HordeClient::Initialize() m_Http = std::make_unique<zen::HttpClient>(m_Config.ServerUrl, Settings); - if (!m_Config.AuthToken.empty()) + if (Settings.AccessTokenProvider) { if (!m_Http->Authenticate()) { @@ -63,24 +68,21 @@ HordeClient::BuildRequestBody() const Requirements["pool"] = m_Config.Pool; } - std::string Condition; -#if ZEN_PLATFORM_WINDOWS ExtendableStringBuilder<256> CondBuf; +#if ZEN_PLATFORM_WINDOWS CondBuf << "(OSFamily == 'Windows' || WineEnabled == '" << (m_Config.AllowWine ? "true" : "false") << "')"; - Condition = std::string(CondBuf); #elif ZEN_PLATFORM_MAC - Condition = "OSFamily == 'MacOS'"; + CondBuf << "OSFamily == 'MacOS'"; #else - Condition = "OSFamily == 'Linux'"; + CondBuf << "OSFamily == 'Linux'"; #endif if (!m_Config.Condition.empty()) { - Condition += " "; - Condition += m_Config.Condition; + CondBuf << " " << m_Config.Condition; } - Requirements["condition"] = Condition; + Requirements["condition"] = std::string(CondBuf); Requirements["exclusive"] = true; json11::Json::object Connection; @@ -157,37 +159,8 @@ HordeClient::ResolveCluster(const std::string& RequestBody, ClusterInfo& OutClus } OutCluster.ClusterId = ClusterIdVal.string_value(); - return true; -} - -bool -HordeClient::ParseHexBytes(std::string_view Hex, uint8_t* Out, size_t OutSize) -{ - if (Hex.size() != OutSize * 2) - { - return false; - } - for (size_t i = 0; i < OutSize; ++i) - { - auto HexToByte = [](char c) -> int { - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - if (c >= 'A' && c <= 'F') - return c - 'A' + 10; - return -1; - }; - - const int Hi = HexToByte(Hex[i * 2]); - const int Lo = HexToByte(Hex[i * 2 + 1]); - if (Hi < 0 || Lo < 0) - { - return false; - } - Out[i] = static_cast<uint8_t>((Hi << 4) | Lo); - } + ZEN_DEBUG("cluster resolution succeeded: clusterId='{}'", OutCluster.ClusterId); return true; } @@ -197,8 +170,6 @@ HordeClient::RequestMachine(const std::string& RequestBody, const std::string& C { ZEN_TRACE_CPU("HordeClient::RequestMachine"); - ZEN_INFO("requesting machine from Horde with cluster '{}'", ClusterId.empty() ? "default" : ClusterId.c_str()); - ExtendableStringBuilder<128> ResourcePath; ResourcePath << "api/v2/compute/" << (ClusterId.empty() ? "default" : ClusterId.c_str()); @@ -324,6 +295,10 @@ HordeClient::RequestMachine(const std::string& RequestBody, const std::string& C { PhysicalCores = static_cast<uint16_t>(std::atoi(Prop.c_str() + 14)); } + else if (Prop.starts_with("Pool=")) + { + OutMachine.Pool = Prop.substr(5); + } } } @@ -367,10 +342,12 @@ HordeClient::RequestMachine(const std::string& RequestBody, const std::string& C OutMachine.LeaseId = LeaseIdVal.string_value(); } - ZEN_INFO("Horde machine assigned [{}:{}] cores={} lease={}", + ZEN_INFO("Horde machine assigned [{}:{}] mode={} cores={} pool={} lease={}", OutMachine.GetConnectionAddress(), OutMachine.GetConnectionPort(), + ToString(OutMachine.Mode), OutMachine.LogicalCores, + OutMachine.Pool, OutMachine.LeaseId); return true; |