aboutsummaryrefslogtreecommitdiff
path: root/src/zenhorde/hordeclient.cpp
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2026-04-13 16:38:16 +0200
committerGitHub Enterprise <[email protected]>2026-04-13 16:38:16 +0200
commit795345e5fd7974a1f5227d507a58bb3ed75eafd5 (patch)
tree7a0f142bf562c3590400586c82b0e7a1b5ad6493 /src/zenhorde/hordeclient.cpp
parent5.8.4-pre2 (diff)
downloadzen-795345e5fd7974a1f5227d507a58bb3ed75eafd5.tar.xz
zen-795345e5fd7974a1f5227d507a58bb3ed75eafd5.zip
Compute OIDC auth, async Horde agents, and orchestrator improvements (#913)
Rework of the Horde agent subsystem from synchronous per-thread I/O to an async ASIO-driven architecture, plus provisioner scale-down with graceful draining, OIDC authentication, scheduler improvements, and dashboard UI for provisioner control. ### Async Horde Agent Rewrite - Replace synchronous `HordeAgent` (one thread per agent, blocking I/O) with `AsyncHordeAgent` — an ASIO state machine running on a shared `io_context` thread pool - Replace `TcpComputeTransport`/`AesComputeTransport` with `AsyncTcpComputeTransport`/`AsyncAesComputeTransport` - Replace `AgentMessageChannel` with `AsyncAgentMessageChannel` using frame queuing and ASIO timers - Delete `ComputeBuffer` and `ComputeChannel` ring-buffer classes (no longer needed) ### Provisioner Drain / Scale-Down - `HordeProvisioner` can now drain agents when target core count is lowered: queries each agent's `/compute/session/status` for workload, selects candidates by largest-fit/lowest-workload, and sends `/compute/session/drain` - Configurable `--horde-drain-grace-period` (default 300s) before force-kill - Implement `IProvisionerStateProvider` interface to expose provisioner state to the orchestrator HTTP layer - Forward `--coordinator-session`, `--provision-clean`, and `--provision-tracehost` through both Horde and Nomad provisioners to spawned workers ### OIDC Authentication - `HordeClient` accepts an `AccessTokenProvider` (refreshable token function) as alternative to static `--horde-token` - Wire up `OidcToken.exe` auto-discovery via `httpclientauth::CreateFromOidcTokenExecutable` with `--HordeUrl` mode - New `--horde-oidctoken-exe-path` CLI option for explicit path override ### Orchestrator & Scheduler - Orchestrator generates a session ID at startup; workers include `coordinator_session` in announcements so the orchestrator can reject stale-session workers - New `Rejected` action state — when a remote runner declines at capacity, the action is rescheduled without retry count increment - Reduce scheduler lock contention: snapshot pending actions under shared lock, sort/trim outside the lock - Parallelize remote action submission across runners via `WorkerThreadPool` with slow-submit warnings - New action field `FailureReason` populated by all runner types (exit codes, sandbox failures, exceptions) - New endpoints: `session/drain`, `session/status`, `session/sunset`, `provisioner/status`, `provisioner/target` ### Remote Execution - Eager-attach mode for `RemoteHttpRunner` — bundles all attachments upfront in a `CbPackage` for single-roundtrip submits - Track in-flight submissions to prevent over-queuing - Show remote runner hostname in `GetDisplayName()` - `--announce-url` to override the endpoint announced to the coordinator (e.g. relay-visible address) ### Frontend Dashboard - Delete standalone `compute.html` (925 lines) and `orchestrator.html` (669 lines), consolidated into JS page modules - Add provisioner panel to orchestrator dashboard: target/active/estimated core counts, draining agent count - Editable target-cores input with debounced POST to `/orch/provisioner/target` - Per-agent provisioning status badges (active / draining / deallocated) in the agents table - Active vs total CPU counts in agents summary row ### CLI - New `zen compute record-start` / `record-stop` subcommands - `zen exec` progress bar with submit and completion phases, atomic work counters, `--progress` mode (Pretty/Plain/Quiet) ### Other - `DataDir` supports environment variable expansion - Worker manifest validation checks for `worker.zcb` marker to detect incomplete cached directories - Linux/Mac runners `nice(5)` child processes to avoid starving the main server - `ComputeService::SetShutdownCallback` wired to `RequestExit` via `session/sunset` - Curl HTTP client logs effective URL on failure - `MachineInfo` carries `Pool` and `Mode` from Horde response - Horde bundle creation includes `.pdb` on Windows
Diffstat (limited to 'src/zenhorde/hordeclient.cpp')
-rw-r--r--src/zenhorde/hordeclient.cpp65
1 files changed, 21 insertions, 44 deletions
diff --git a/src/zenhorde/hordeclient.cpp b/src/zenhorde/hordeclient.cpp
index 0eefc57c6..618a85e0e 100644
--- a/src/zenhorde/hordeclient.cpp
+++ b/src/zenhorde/hordeclient.cpp
@@ -4,6 +4,7 @@
#include <zencore/iobuffer.h>
#include <zencore/logging.h>
#include <zencore/memoryview.h>
+#include <zencore/string.h>
#include <zencore/trace.h>
#include <zenhorde/hordeclient.h>
#include <zenhttp/httpclient.h>
@@ -14,7 +15,7 @@ ZEN_THIRD_PARTY_INCLUDES_END
namespace zen::horde {
-HordeClient::HordeClient(const HordeConfig& Config) : m_Config(Config), m_Log(zen::logging::Get("horde.client"))
+HordeClient::HordeClient(HordeConfig Config) : m_Config(std::move(Config)), m_Log("horde.client")
{
}
@@ -32,7 +33,11 @@ HordeClient::Initialize()
Settings.RetryCount = 1;
Settings.ExpectedErrorCodes = {HttpResponseCode::ServiceUnavailable, HttpResponseCode::TooManyRequests};
- if (!m_Config.AuthToken.empty())
+ if (m_Config.AccessTokenProvider)
+ {
+ Settings.AccessTokenProvider = m_Config.AccessTokenProvider;
+ }
+ else if (!m_Config.AuthToken.empty())
{
Settings.AccessTokenProvider = [token = m_Config.AuthToken]() -> HttpClientAccessToken {
return HttpClientAccessToken(token, HttpClientAccessToken::Clock::now() + std::chrono::hours{24});
@@ -41,7 +46,7 @@ HordeClient::Initialize()
m_Http = std::make_unique<zen::HttpClient>(m_Config.ServerUrl, Settings);
- if (!m_Config.AuthToken.empty())
+ if (Settings.AccessTokenProvider)
{
if (!m_Http->Authenticate())
{
@@ -63,24 +68,21 @@ HordeClient::BuildRequestBody() const
Requirements["pool"] = m_Config.Pool;
}
- std::string Condition;
-#if ZEN_PLATFORM_WINDOWS
ExtendableStringBuilder<256> CondBuf;
+#if ZEN_PLATFORM_WINDOWS
CondBuf << "(OSFamily == 'Windows' || WineEnabled == '" << (m_Config.AllowWine ? "true" : "false") << "')";
- Condition = std::string(CondBuf);
#elif ZEN_PLATFORM_MAC
- Condition = "OSFamily == 'MacOS'";
+ CondBuf << "OSFamily == 'MacOS'";
#else
- Condition = "OSFamily == 'Linux'";
+ CondBuf << "OSFamily == 'Linux'";
#endif
if (!m_Config.Condition.empty())
{
- Condition += " ";
- Condition += m_Config.Condition;
+ CondBuf << " " << m_Config.Condition;
}
- Requirements["condition"] = Condition;
+ Requirements["condition"] = std::string(CondBuf);
Requirements["exclusive"] = true;
json11::Json::object Connection;
@@ -157,37 +159,8 @@ HordeClient::ResolveCluster(const std::string& RequestBody, ClusterInfo& OutClus
}
OutCluster.ClusterId = ClusterIdVal.string_value();
- return true;
-}
-
-bool
-HordeClient::ParseHexBytes(std::string_view Hex, uint8_t* Out, size_t OutSize)
-{
- if (Hex.size() != OutSize * 2)
- {
- return false;
- }
- for (size_t i = 0; i < OutSize; ++i)
- {
- auto HexToByte = [](char c) -> int {
- if (c >= '0' && c <= '9')
- return c - '0';
- if (c >= 'a' && c <= 'f')
- return c - 'a' + 10;
- if (c >= 'A' && c <= 'F')
- return c - 'A' + 10;
- return -1;
- };
-
- const int Hi = HexToByte(Hex[i * 2]);
- const int Lo = HexToByte(Hex[i * 2 + 1]);
- if (Hi < 0 || Lo < 0)
- {
- return false;
- }
- Out[i] = static_cast<uint8_t>((Hi << 4) | Lo);
- }
+ ZEN_DEBUG("cluster resolution succeeded: clusterId='{}'", OutCluster.ClusterId);
return true;
}
@@ -197,8 +170,6 @@ HordeClient::RequestMachine(const std::string& RequestBody, const std::string& C
{
ZEN_TRACE_CPU("HordeClient::RequestMachine");
- ZEN_INFO("requesting machine from Horde with cluster '{}'", ClusterId.empty() ? "default" : ClusterId.c_str());
-
ExtendableStringBuilder<128> ResourcePath;
ResourcePath << "api/v2/compute/" << (ClusterId.empty() ? "default" : ClusterId.c_str());
@@ -324,6 +295,10 @@ HordeClient::RequestMachine(const std::string& RequestBody, const std::string& C
{
PhysicalCores = static_cast<uint16_t>(std::atoi(Prop.c_str() + 14));
}
+ else if (Prop.starts_with("Pool="))
+ {
+ OutMachine.Pool = Prop.substr(5);
+ }
}
}
@@ -367,10 +342,12 @@ HordeClient::RequestMachine(const std::string& RequestBody, const std::string& C
OutMachine.LeaseId = LeaseIdVal.string_value();
}
- ZEN_INFO("Horde machine assigned [{}:{}] cores={} lease={}",
+ ZEN_INFO("Horde machine assigned [{}:{}] mode={} cores={} pool={} lease={}",
OutMachine.GetConnectionAddress(),
OutMachine.GetConnectionPort(),
+ ToString(OutMachine.Mode),
OutMachine.LogicalCores,
+ OutMachine.Pool,
OutMachine.LeaseId);
return true;