diff options
| author | Dan Engelbrecht <[email protected]> | 2026-03-30 11:21:50 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2026-03-30 11:21:50 +0200 |
| commit | 10613ce78f4a01658ebacdd0ff8ce464b3d13c3f (patch) | |
| tree | fb45b1ff37e8c5582c82c56bfd2540511ce94abe /src | |
| parent | reuse single MinIO instance across s3client integration test (#901) (diff) | |
| download | zen-10613ce78f4a01658ebacdd0ff8ce464b3d13c3f.tar.xz zen-10613ce78f4a01658ebacdd0ff8ce464b3d13c3f.zip | |
hub resource limits (#900)
- Feature: Hub dashboard now shows a Resources tile with disk and memory usage against configured limits
- Feature: Hub module listing now shows state-change timestamps and duration for each instance
- Improvement: Hub provisioning rejects new instances when disk or memory usage exceeds configurable thresholds; limits are disabled by default (0 = no limit)
- `--hub-provision-disk-limit-bytes` - Reject provisioning when used disk exceeds this many bytes
- `--hub-provision-disk-limit-percent` - Reject provisioning when used disk exceeds this percentage of total disk
- `--hub-provision-memory-limit-bytes` - Reject provisioning when used memory exceeds this many bytes
- `--hub-provision-memory-limit-percent` - Reject provisioning when used memory exceeds this percentage of total RAM
- Improvement: Hub process metrics are now tracked atomically per active instance slot, eliminating per-query process handle lookups
- Improvement: Hub, Build Store, and Workspaces service stats sections in the dashboard are now collapsible
- Bugfix: Hub watchdog loop did not check `m_ShutdownFlag`, causing it to spin indefinitely on shutdown
Diffstat (limited to 'src')
| -rw-r--r-- | src/zenserver/frontend/html/pages/builds.js | 2 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/pages/cache.js | 33 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/pages/compute.js | 33 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/pages/hub.js | 81 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/pages/orchestrator.js | 33 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/pages/page.js | 33 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/pages/projects.js | 33 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/pages/workspaces.js | 2 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/zen.css | 4 | ||||
| -rw-r--r-- | src/zenserver/hub/httphubservice.cpp | 47 | ||||
| -rw-r--r-- | src/zenserver/hub/hub.cpp | 264 | ||||
| -rw-r--r-- | src/zenserver/hub/hub.h | 43 | ||||
| -rw-r--r-- | src/zenserver/hub/storageserverinstance.cpp | 36 | ||||
| -rw-r--r-- | src/zenserver/hub/storageserverinstance.h | 32 | ||||
| -rw-r--r-- | src/zenserver/hub/zenhubserver.cpp | 122 | ||||
| -rw-r--r-- | src/zenserver/hub/zenhubserver.h | 7 | ||||
| -rw-r--r-- | src/zenutil/include/zenutil/zenserverprocess.h | 1 |
17 files changed, 500 insertions, 306 deletions
diff --git a/src/zenserver/frontend/html/pages/builds.js b/src/zenserver/frontend/html/pages/builds.js index 095f0bf29..6b3426378 100644 --- a/src/zenserver/frontend/html/pages/builds.js +++ b/src/zenserver/frontend/html/pages/builds.js @@ -16,7 +16,7 @@ export class Page extends ZenPage this.set_title("build store"); // Build Store Stats - const stats_section = this.add_section("Build Store Stats"); + const stats_section = this._collapsible_section("Build Store Service Stats"); stats_section.tag().classify("dropall").text("raw yaml \u2192").on_click(() => { window.open("/stats/builds.yaml", "_blank"); }); diff --git a/src/zenserver/frontend/html/pages/cache.js b/src/zenserver/frontend/html/pages/cache.js index 1fc8227c8..e0f6f73b6 100644 --- a/src/zenserver/frontend/html/pages/cache.js +++ b/src/zenserver/frontend/html/pages/cache.js @@ -95,39 +95,6 @@ export class Page extends ZenPage } } - _collapsible_section(name) - { - const section = this.add_section(name); - const container = section._parent.inner(); - const heading = container.firstElementChild; - - heading.style.cursor = "pointer"; - heading.style.userSelect = "none"; - - const indicator = document.createElement("span"); - indicator.textContent = " \u25BC"; - indicator.style.fontSize = "0.7em"; - heading.appendChild(indicator); - - let collapsed = false; - heading.addEventListener("click", (e) => { - if (e.target !== heading && e.target !== indicator) - { - return; - } - collapsed = !collapsed; - indicator.textContent = collapsed ? " \u25B6" : " \u25BC"; - let sibling = heading.nextElementSibling; - while (sibling) - { - sibling.style.display = collapsed ? "none" : ""; - sibling = sibling.nextElementSibling; - } - }); - - return section; - } - _render_stats(stats) { const safe = (obj, path) => path.split(".").reduce((a, b) => a && a[b], obj); diff --git a/src/zenserver/frontend/html/pages/compute.js b/src/zenserver/frontend/html/pages/compute.js index ab3d49c27..d1a880954 100644 --- a/src/zenserver/frontend/html/pages/compute.js +++ b/src/zenserver/frontend/html/pages/compute.js @@ -100,39 +100,6 @@ export class Page extends ZenPage }, 2000); } - _collapsible_section(name) - { - const section = this.add_section(name); - const container = section._parent.inner(); - const heading = container.firstElementChild; - - heading.style.cursor = "pointer"; - heading.style.userSelect = "none"; - - const indicator = document.createElement("span"); - indicator.textContent = " \u25BC"; - indicator.style.fontSize = "0.7em"; - heading.appendChild(indicator); - - let collapsed = false; - heading.addEventListener("click", (e) => { - if (e.target !== heading && e.target !== indicator) - { - return; - } - collapsed = !collapsed; - indicator.textContent = collapsed ? " \u25B6" : " \u25BC"; - let sibling = heading.nextElementSibling; - while (sibling) - { - sibling.style.display = collapsed ? "none" : ""; - sibling = sibling.nextElementSibling; - } - }); - - return section; - } - async _load_chartjs() { if (window.Chart) diff --git a/src/zenserver/frontend/html/pages/hub.js b/src/zenserver/frontend/html/pages/hub.js index 78e3a090c..c6f96d496 100644 --- a/src/zenserver/frontend/html/pages/hub.js +++ b/src/zenserver/frontend/html/pages/hub.js @@ -82,7 +82,7 @@ export class Page extends ZenPage this.set_title("hub"); // Capacity - const stats_section = this.add_section("Capacity"); + const stats_section = this._collapsible_section("Hub Service Stats"); this._stats_grid = stats_section.tag().classify("grid").classify("stats-tiles"); // Modules @@ -203,27 +203,46 @@ export class Page extends ZenPage { const tile = grid.tag().classify("card").classify("stats-tile"); - tile.tag().classify("card-title").text("Active Modules"); + tile.tag().classify("card-title").text("Instances"); const body = tile.tag().classify("tile-metrics"); this._metric(body, Friendly.sep(current), "currently provisioned", true); + this._metric(body, Friendly.sep(max), "high watermark"); + this._metric(body, Friendly.sep(limit), "maximum allowed"); + if (limit > 0) + { + const pct = ((current / limit) * 100).toFixed(0) + "%"; + this._metric(body, pct, "utilization"); + } } + const machine = data.machine || {}; + const limits = data.resource_limits || {}; + if (machine.disk_total_bytes > 0 || machine.memory_total_mib > 0) { - const tile = grid.tag().classify("card").classify("stats-tile"); - tile.tag().classify("card-title").text("Peak Modules"); - const body = tile.tag().classify("tile-metrics"); - this._metric(body, Friendly.sep(max), "high watermark", true); - } + const disk_used = Math.max(0, (machine.disk_total_bytes || 0) - (machine.disk_free_bytes || 0)); + const mem_used = Math.max(0, (machine.memory_total_mib || 0) - (machine.memory_avail_mib || 0)) * 1024 * 1024; + const vmem_used = Math.max(0, (machine.virtual_memory_total_mib || 0) - (machine.virtual_memory_avail_mib || 0)) * 1024 * 1024; + const disk_limit = limits.disk_bytes || 0; + const mem_limit = limits.memory_bytes || 0; + const disk_over = disk_limit > 0 && disk_used > disk_limit; + const mem_over = mem_limit > 0 && mem_used > mem_limit; - { const tile = grid.tag().classify("card").classify("stats-tile"); - tile.tag().classify("card-title").text("Instance Limit"); - const body = tile.tag().classify("tile-metrics"); - this._metric(body, Friendly.sep(limit), "maximum allowed", true); - if (limit > 0) + if (disk_over || mem_over) { tile.inner().setAttribute("data-over", "true"); } + tile.tag().classify("card-title").text("Resources"); + const columns = tile.tag().classify("tile-columns"); + + const left = columns.tag().classify("tile-metrics"); + this._metric(left, Friendly.bytes(disk_used), "disk used", true); + if (disk_limit > 0) { this._metric(left, Friendly.bytes(disk_limit), "disk limit"); } + + const right = columns.tag().classify("tile-metrics"); + this._metric(right, Friendly.bytes(mem_used), "memory used", true); + if (mem_limit > 0) { this._metric(right, Friendly.bytes(mem_limit), "memory limit"); } + if (machine.virtual_memory_total_mib > 0) { - const pct = ((current / limit) * 100).toFixed(0) + "%"; - this._metric(body, pct, "utilization"); + this._metric(right, Friendly.bytes(vmem_used), "vmem used", true); + this._metric(right, Friendly.bytes(machine.virtual_memory_total_mib * 1024 * 1024), "vmem total"); } } } @@ -284,6 +303,14 @@ export class Page extends ZenPage } row.state_text.nodeValue = state; row.port_text.nodeValue = m.port ? String(m.port) : ""; + if (m.state_change_time) + { + const state_label = state.charAt(0).toUpperCase() + state.slice(1); + row.state_since_label.textContent = state_label + " since"; + row.state_age_label.textContent = state_label + " for"; + row.state_since_node.nodeValue = m.state_change_time; + row.state_age_node.nodeValue = Friendly.timespan(Date.now() - new Date(m.state_change_time).getTime()); + } row.btn_open.disabled = state !== "provisioned"; row.btn_hibernate.disabled = !_btn_enabled(state, "hibernate"); row.btn_wake.disabled = !_btn_enabled(state, "wake"); @@ -388,7 +415,7 @@ export class Page extends ZenPage td_action.appendChild(wrap_o); tr.appendChild(td_action); - // Build metrics grid from process_metrics keys. + // Build metrics grid: fixed state-time rows followed by process_metrics keys. // Keys are split into two halves and interleaved so the grid fills // top-to-bottom in the left column before continuing in the right column. const metric_nodes = new Map(); @@ -396,6 +423,28 @@ export class Page extends ZenPage metrics_td.colSpan = 6; const metrics_grid = document.createElement("div"); metrics_grid.className = "module-metrics-grid"; + + const _add_fixed_pair = (label, value_str) => { + const label_el = document.createElement("span"); + label_el.className = "module-metrics-label"; + label_el.textContent = label; + const value_node = document.createTextNode(value_str); + const value_el = document.createElement("span"); + value_el.className = "module-metrics-value"; + value_el.appendChild(value_node); + metrics_grid.appendChild(label_el); + metrics_grid.appendChild(value_el); + return { label_el, value_node }; + }; + + const state_label = m.state ? m.state.charAt(0).toUpperCase() + m.state.slice(1) : "State"; + const state_since_str = m.state_change_time || ""; + const state_age_str = m.state_change_time + ? Friendly.timespan(Date.now() - new Date(m.state_change_time).getTime()) + : ""; + const { label_el: state_since_label, value_node: state_since_node } = _add_fixed_pair(state_label + " since", state_since_str); + const { label_el: state_age_label, value_node: state_age_node } = _add_fixed_pair(state_label + " for", state_age_str); + const keys = Object.keys(m.process_metrics || {}); const half = Math.ceil(keys.length / 2); const add_metric_pair = (key) => { @@ -423,7 +472,7 @@ export class Page extends ZenPage metrics_td.appendChild(metrics_grid); metrics_tr.appendChild(metrics_td); - row = { tr, metrics_tr, idx: td_idx, cb, dot, state_text: state_node, port_text: port_node, btn_expand, btn_open: btn_o, btn_hibernate: btn_h, btn_wake: btn_w, btn_deprov: btn_d, metric_nodes }; + row = { tr, metrics_tr, idx: td_idx, cb, dot, state_text: state_node, port_text: port_node, btn_expand, btn_open: btn_o, btn_hibernate: btn_h, btn_wake: btn_w, btn_deprov: btn_d, metric_nodes, state_since_node, state_age_node, state_since_label, state_age_label }; this._row_cache.set(id, row); } diff --git a/src/zenserver/frontend/html/pages/orchestrator.js b/src/zenserver/frontend/html/pages/orchestrator.js index 4a9290a3c..a280fabdb 100644 --- a/src/zenserver/frontend/html/pages/orchestrator.js +++ b/src/zenserver/frontend/html/pages/orchestrator.js @@ -46,39 +46,6 @@ export class Page extends ZenPage this._connect_ws(); } - _collapsible_section(name) - { - const section = this.add_section(name); - const container = section._parent.inner(); - const heading = container.firstElementChild; - - heading.style.cursor = "pointer"; - heading.style.userSelect = "none"; - - const indicator = document.createElement("span"); - indicator.textContent = " \u25BC"; - indicator.style.fontSize = "0.7em"; - heading.appendChild(indicator); - - let collapsed = false; - heading.addEventListener("click", (e) => { - if (e.target !== heading && e.target !== indicator) - { - return; - } - collapsed = !collapsed; - indicator.textContent = collapsed ? " \u25B6" : " \u25BC"; - let sibling = heading.nextElementSibling; - while (sibling) - { - sibling.style.display = collapsed ? "none" : ""; - sibling = sibling.nextElementSibling; - } - }); - - return section; - } - async _fetch_all() { try diff --git a/src/zenserver/frontend/html/pages/page.js b/src/zenserver/frontend/html/pages/page.js index cf8d3e3dd..ff530ff8e 100644 --- a/src/zenserver/frontend/html/pages/page.js +++ b/src/zenserver/frontend/html/pages/page.js @@ -337,4 +337,37 @@ export class ZenPage extends PageBase this._metric(right, Friendly.duration(reqData.t_max), "max"); } } + + _collapsible_section(name) + { + const section = this.add_section(name); + const container = section._parent.inner(); + const heading = container.firstElementChild; + + heading.style.cursor = "pointer"; + heading.style.userSelect = "none"; + + const indicator = document.createElement("span"); + indicator.textContent = " \u25BC"; + indicator.style.fontSize = "0.7em"; + heading.appendChild(indicator); + + let collapsed = false; + heading.addEventListener("click", (e) => { + if (e.target !== heading && e.target !== indicator) + { + return; + } + collapsed = !collapsed; + indicator.textContent = collapsed ? " \u25B6" : " \u25BC"; + let sibling = heading.nextElementSibling; + while (sibling) + { + sibling.style.display = collapsed ? "none" : ""; + sibling = sibling.nextElementSibling; + } + }); + + return section; + } } diff --git a/src/zenserver/frontend/html/pages/projects.js b/src/zenserver/frontend/html/pages/projects.js index 2469bf70b..dfe4faeb8 100644 --- a/src/zenserver/frontend/html/pages/projects.js +++ b/src/zenserver/frontend/html/pages/projects.js @@ -110,39 +110,6 @@ export class Page extends ZenPage } } - _collapsible_section(name) - { - const section = this.add_section(name); - const container = section._parent.inner(); - const heading = container.firstElementChild; - - heading.style.cursor = "pointer"; - heading.style.userSelect = "none"; - - const indicator = document.createElement("span"); - indicator.textContent = " \u25BC"; - indicator.style.fontSize = "0.7em"; - heading.appendChild(indicator); - - let collapsed = false; - heading.addEventListener("click", (e) => { - if (e.target !== heading && e.target !== indicator) - { - return; - } - collapsed = !collapsed; - indicator.textContent = collapsed ? " \u25B6" : " \u25BC"; - let sibling = heading.nextElementSibling; - while (sibling) - { - sibling.style.display = collapsed ? "none" : ""; - sibling = sibling.nextElementSibling; - } - }); - - return section; - } - _clear_param(name) { this._params.delete(name); diff --git a/src/zenserver/frontend/html/pages/workspaces.js b/src/zenserver/frontend/html/pages/workspaces.js index d31fd7373..2442fb35b 100644 --- a/src/zenserver/frontend/html/pages/workspaces.js +++ b/src/zenserver/frontend/html/pages/workspaces.js @@ -13,7 +13,7 @@ export class Page extends ZenPage this.set_title("workspaces"); // Workspace Service Stats - const stats_section = this.add_section("Workspace Service Stats"); + const stats_section = this._collapsible_section("Workspace Service Stats"); this._stats_grid = stats_section.tag().classify("grid").classify("stats-tiles"); const stats = await new Fetcher().resource("stats", "ws").json().catch(() => null); diff --git a/src/zenserver/frontend/html/zen.css b/src/zenserver/frontend/html/zen.css index d9f7491ea..cb3d78cf2 100644 --- a/src/zenserver/frontend/html/zen.css +++ b/src/zenserver/frontend/html/zen.css @@ -816,6 +816,10 @@ zen-banner + zen-nav::part(nav-bar) { border-color: var(--theme_p0); } +.stats-tile[data-over="true"] { + border-color: var(--theme_fail); +} + .stats-tile-detailed { position: relative; } diff --git a/src/zenserver/hub/httphubservice.cpp b/src/zenserver/hub/httphubservice.cpp index ebefcf2e3..d52da5ae7 100644 --- a/src/zenserver/hub/httphubservice.cpp +++ b/src/zenserver/hub/httphubservice.cpp @@ -78,6 +78,10 @@ HttpHubService::HttpHubService(Hub& Hub, HttpStatsService& StatsService, HttpSta Obj << "moduleId" << ModuleId; Obj << "state" << ToString(Info.State); Obj << "port" << Info.Port; + if (Info.StateChangeTime != std::chrono::system_clock::time_point::min()) + { + Obj << "state_change_time" << ToDateTime(Info.StateChangeTime); + } Obj.BeginObject("process_metrics"); { Obj << "MemoryBytes" << Info.Metrics.MemoryBytes; @@ -228,17 +232,6 @@ HttpHubService::HttpHubService(Hub& Hub, HttpStatsService& StatsService, HttpSta }, HttpVerb::kPost); - m_Router.RegisterRoute( - "stats", - [this](HttpRouterRequest& Req) { - CbObjectWriter Obj; - Obj << "currentInstanceCount" << m_Hub.GetInstanceCount(); - Obj << "maxInstanceCount" << m_Hub.GetMaxInstanceCount(); - Obj << "instanceLimit" << m_Hub.GetConfig().InstanceLimit; - Req.ServerRequest().WriteResponse(HttpResponseCode::OK, Obj.Save()); - }, - HttpVerb::kGet); - m_StatsService.RegisterHandler("hub", *this); m_StatusService.RegisterHandler("hub", *this); } @@ -286,7 +279,37 @@ HttpHubService::HandleStatusRequest(HttpServerRequest& Request) void HttpHubService::HandleStatsRequest(HttpServerRequest& Request) { - Request.WriteResponse(HttpResponseCode::OK, CollectStats()); + CbObjectWriter Cbo; + + EmitSnapshot("requests", m_HttpRequests, Cbo); + + Cbo << "currentInstanceCount" << m_Hub.GetInstanceCount(); + Cbo << "maxInstanceCount" << m_Hub.GetMaxInstanceCount(); + Cbo << "instanceLimit" << m_Hub.GetConfig().InstanceLimit; + + SystemMetrics SysMetrics; + DiskSpace Disk; + m_Hub.GetMachineMetrics(SysMetrics, Disk); + Cbo.BeginObject("machine"); + { + Cbo << "disk_free_bytes" << Disk.Free; + Cbo << "disk_total_bytes" << Disk.Total; + Cbo << "memory_avail_mib" << SysMetrics.AvailSystemMemoryMiB; + Cbo << "memory_total_mib" << SysMetrics.SystemMemoryMiB; + Cbo << "virtual_memory_avail_mib" << SysMetrics.AvailVirtualMemoryMiB; + Cbo << "virtual_memory_total_mib" << SysMetrics.VirtualMemoryMiB; + } + Cbo.EndObject(); + + const ResourceMetrics& Limits = m_Hub.GetConfig().ResourceLimits; + Cbo.BeginObject("resource_limits"); + { + Cbo << "disk_bytes" << Limits.DiskUsageBytes; + Cbo << "memory_bytes" << Limits.MemoryUsageBytes; + } + Cbo.EndObject(); + + Request.WriteResponse(HttpResponseCode::OK, Cbo.Save()); } CbObject diff --git a/src/zenserver/hub/hub.cpp b/src/zenserver/hub/hub.cpp index 6c44e2333..bf846d68e 100644 --- a/src/zenserver/hub/hub.cpp +++ b/src/zenserver/hub/hub.cpp @@ -19,7 +19,6 @@ ZEN_THIRD_PARTY_INCLUDES_START ZEN_THIRD_PARTY_INCLUDES_END #if ZEN_WITH_TESTS -# include <zencore/filesystem.h> # include <zencore/testing.h> # include <zencore/testutils.h> #endif @@ -122,6 +121,55 @@ private: ////////////////////////////////////////////////////////////////////////// +ProcessMetrics +Hub::AtomicProcessMetrics::Load() const +{ + return { + .MemoryBytes = MemoryBytes.load(), + .KernelTimeMs = KernelTimeMs.load(), + .UserTimeMs = UserTimeMs.load(), + .WorkingSetSize = WorkingSetSize.load(), + .PeakWorkingSetSize = PeakWorkingSetSize.load(), + .PagefileUsage = PagefileUsage.load(), + .PeakPagefileUsage = PeakPagefileUsage.load(), + }; +} + +void +Hub::AtomicProcessMetrics::Store(const ProcessMetrics& Metrics) +{ + MemoryBytes.store(Metrics.MemoryBytes); + KernelTimeMs.store(Metrics.KernelTimeMs); + UserTimeMs.store(Metrics.UserTimeMs); + WorkingSetSize.store(Metrics.WorkingSetSize); + PeakWorkingSetSize.store(Metrics.PeakWorkingSetSize); + PagefileUsage.store(Metrics.PagefileUsage); + PeakPagefileUsage.store(Metrics.PeakPagefileUsage); +} + +void +Hub::AtomicProcessMetrics::Reset() +{ + MemoryBytes.store(0); + KernelTimeMs.store(0); + UserTimeMs.store(0); + WorkingSetSize.store(0); + PeakWorkingSetSize.store(0); + PagefileUsage.store(0); + PeakPagefileUsage.store(0); +} + +void +Hub::GetMachineMetrics(SystemMetrics& OutSystemMetrict, DiskSpace& OutDiskSpace) const +{ + m_Lock.WithSharedLock([&]() { + OutSystemMetrict = m_SystemMetrics; + OutDiskSpace = m_DiskSpace; + }); +} + +////////////////////////////////////////////////////////////////////////// + Hub::Hub(const Configuration& Config, ZenServerEnvironment&& RunEnvironment, WorkerThreadPool* OptionalWorkerPool, @@ -134,10 +182,6 @@ Hub::Hub(const Configuration& Config, , m_ActiveInstances(Config.InstanceLimit) , m_FreeActiveInstanceIndexes(Config.InstanceLimit) { - m_HostMetrics = GetSystemMetrics(); - m_ResourceLimits.DiskUsageBytes = 1000ull * 1024 * 1024 * 1024; - m_ResourceLimits.MemoryUsageBytes = 16ull * 1024 * 1024 * 1024; - if (m_Config.HydrationTargetSpecification.empty()) { std::filesystem::path FileHydrationPath = m_RunEnvironment.CreateChildDir("hydration_storage"); @@ -171,6 +215,9 @@ Hub::Hub(const Configuration& Config, } } #endif + + UpdateMachineMetrics(); + m_WatchDog = std::thread([this]() { WatchDog(); }); } @@ -195,6 +242,9 @@ Hub::Shutdown() { ZEN_INFO("Hub service shutting down, deprovisioning any current instances"); + bool Expected = false; + bool WaitForBackgroundWork = m_ShutdownFlag.compare_exchange_strong(Expected, true); + m_WatchDogEvent.Set(); if (m_WatchDog.joinable()) { @@ -203,8 +253,6 @@ Hub::Shutdown() m_WatchDog = {}; - bool Expected = false; - bool WaitForBackgroundWork = m_ShutdownFlag.compare_exchange_strong(Expected, true); if (WaitForBackgroundWork && m_WorkerPool) { m_BackgroundWorkLatch.CountDown(); @@ -254,7 +302,7 @@ Hub::Provision(std::string_view ModuleId, HubProvisionedInstanceInfo& OutInfo) if (auto It = m_InstanceLookup.find(std::string(ModuleId)); It == m_InstanceLookup.end()) { std::string Reason; - if (!CanProvisionInstance(ModuleId, /* out */ Reason)) + if (!CanProvisionInstanceLocked(ModuleId, /* out */ Reason)) { ZEN_WARN("Cannot provision new storage server instance for module '{}': {}", ModuleId, Reason); @@ -289,6 +337,7 @@ Hub::Provision(std::string_view ModuleId, HubProvisionedInstanceInfo& OutInfo) Instance = NewInstance->LockExclusive(/*Wait*/ true); m_ActiveInstances[ActiveInstanceIndex].Instance = std::move(NewInstance); + m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Reset(); m_InstanceLookup.insert_or_assign(std::string(ModuleId), ActiveInstanceIndex); // Set Provisioning while both hub lock and instance lock are held so that any // concurrent Deprovision sees the in-flight state, not Unprovisioned. @@ -947,12 +996,10 @@ Hub::Find(std::string_view ModuleId, InstanceInfo* OutInstanceInfo) ZEN_ASSERT(ActiveInstanceIndex < m_ActiveInstances.size()); const std::unique_ptr<StorageServerInstance>& Instance = m_ActiveInstances[ActiveInstanceIndex].Instance; ZEN_ASSERT(Instance); - InstanceInfo Info{ - m_ActiveInstances[ActiveInstanceIndex].State.load(), - std::chrono::system_clock::now() // TODO - }; - Instance->GetProcessMetrics(Info.Metrics); - Info.Port = Instance->GetBasePort(); + InstanceInfo Info{m_ActiveInstances[ActiveInstanceIndex].State.load(), + m_ActiveInstances[ActiveInstanceIndex].StateChangeTime.load()}; + Info.Metrics = m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Load(); + Info.Port = Instance->GetBasePort(); *OutInstanceInfo = Info; } @@ -971,12 +1018,10 @@ Hub::EnumerateModules(std::function<void(std::string_view ModuleId, const Instan { const std::unique_ptr<StorageServerInstance>& Instance = m_ActiveInstances[ActiveInstanceIndex].Instance; ZEN_ASSERT(Instance); - InstanceInfo Info{ - m_ActiveInstances[ActiveInstanceIndex].State.load(), - std::chrono::system_clock::now() // TODO - }; - Instance->GetProcessMetrics(Info.Metrics); - Info.Port = Instance->GetBasePort(); + InstanceInfo Info{m_ActiveInstances[ActiveInstanceIndex].State.load(), + m_ActiveInstances[ActiveInstanceIndex].StateChangeTime.load()}; + Info.Metrics = m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Load(); + Info.Port = Instance->GetBasePort(); Infos.push_back(std::make_pair(std::string(Instance->GetModuleId()), Info)); } @@ -994,28 +1039,8 @@ Hub::GetInstanceCount() return m_Lock.WithSharedLock([this]() { return gsl::narrow_cast<int>(m_InstanceLookup.size()); }); } -void -Hub::UpdateCapacityMetrics() -{ - m_HostMetrics = GetSystemMetrics(); - - // TODO: Should probably go into WatchDog and use atomic for update so it can be read without locks... - // Per-instance stats are already refreshed by WatchDog and are readable via the Find and EnumerateModules -} - -void -Hub::UpdateStats() -{ - int CurrentInstanceCount = m_Lock.WithSharedLock([this] { return gsl::narrow_cast<int>(m_InstanceLookup.size()); }); - int CurrentMaxCount = m_MaxInstanceCount.load(); - - int NewMax = Max(CurrentMaxCount, CurrentInstanceCount); - - m_MaxInstanceCount.compare_exchange_weak(CurrentMaxCount, NewMax); -} - bool -Hub::CanProvisionInstance(std::string_view ModuleId, std::string& OutReason) +Hub::CanProvisionInstanceLocked(std::string_view ModuleId, std::string& OutReason) { ZEN_UNUSED(ModuleId); if (m_FreeActiveInstanceIndexes.empty()) @@ -1025,7 +1050,24 @@ Hub::CanProvisionInstance(std::string_view ModuleId, std::string& OutReason) return false; } - // TODO: handle additional resource metrics + const uint64_t DiskUsedBytes = m_DiskSpace.Free <= m_DiskSpace.Total ? m_DiskSpace.Total - m_DiskSpace.Free : 0; + if (m_Config.ResourceLimits.DiskUsageBytes > 0 && DiskUsedBytes > m_Config.ResourceLimits.DiskUsageBytes) + { + OutReason = + fmt::format("disk usage ({}) exceeds ({})", NiceBytes(DiskUsedBytes), NiceBytes(m_Config.ResourceLimits.DiskUsageBytes)); + return false; + } + + const uint64_t RamUsedMiB = m_SystemMetrics.AvailSystemMemoryMiB <= m_SystemMetrics.SystemMemoryMiB + ? m_SystemMetrics.SystemMemoryMiB - m_SystemMetrics.AvailSystemMemoryMiB + : 0; + const uint64_t RamUsedBytes = RamUsedMiB * 1024 * 1024; + if (m_Config.ResourceLimits.MemoryUsageBytes > 0 && RamUsedBytes > m_Config.ResourceLimits.MemoryUsageBytes) + { + OutReason = + fmt::format("ram usage ({}) exceeds ({})", NiceBytes(RamUsedBytes), NiceBytes(m_Config.ResourceLimits.MemoryUsageBytes)); + return false; + } return true; } @@ -1065,8 +1107,10 @@ Hub::UpdateInstanceStateLocked(size_t ActiveInstanceIndex, HubInstanceState NewS } return false; }(m_ActiveInstances[ActiveInstanceIndex].State.load(), NewState)); + const std::chrono::system_clock::time_point Now = std::chrono::system_clock::now(); m_ActiveInstances[ActiveInstanceIndex].LastKnownActivitySum.store(0); - m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.store(std::chrono::system_clock::now()); + m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.store(Now); + m_ActiveInstances[ActiveInstanceIndex].StateChangeTime.store(Now); return m_ActiveInstances[ActiveInstanceIndex].State.exchange(NewState); } @@ -1173,14 +1217,14 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient, StorageServerInstance::SharedLockedPtr&& LockedInstance, size_t ActiveInstanceIndex) { + const std::string ModuleId(LockedInstance.GetModuleId()); + HubInstanceState InstanceState = m_ActiveInstances[ActiveInstanceIndex].State.load(); if (LockedInstance.IsRunning()) { - LockedInstance.UpdateMetrics(); + m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Store(LockedInstance.GetProcessMetrics()); if (InstanceState == HubInstanceState::Provisioned) { - const std::string ModuleId(LockedInstance.GetModuleId()); - const uint16_t Port = LockedInstance.GetBasePort(); const uint64_t PreviousActivitySum = m_ActiveInstances[ActiveInstanceIndex].LastKnownActivitySum.load(); const std::chrono::system_clock::time_point LastActivityTime = m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.load(); @@ -1260,8 +1304,7 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient, else if (InstanceState == HubInstanceState::Provisioned) { // Process is not running but state says it should be - instance died unexpectedly. - const std::string ModuleId(LockedInstance.GetModuleId()); - const uint16_t Port = LockedInstance.GetBasePort(); + const uint16_t Port = LockedInstance.GetBasePort(); UpdateInstanceState(LockedInstance, ActiveInstanceIndex, HubInstanceState::Crashed); NotifyStateUpdate(ModuleId, HubInstanceState::Provisioned, HubInstanceState::Crashed, Port, {}); LockedInstance = {}; @@ -1272,7 +1315,6 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient, { // Process is not running - no HTTP activity check is possible. // Use a pure time-based check; the margin window does not apply here. - const std::string ModuleId = std::string(LockedInstance.GetModuleId()); const std::chrono::system_clock::time_point LastActivityTime = m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.load(); const uint64_t PreviousActivitySum = m_ActiveInstances[ActiveInstanceIndex].LastKnownActivitySum.load(); const std::chrono::system_clock::time_point Now = std::chrono::system_clock::now(); @@ -1312,6 +1354,43 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient, } void +Hub::UpdateMachineMetrics() +{ + try + { + bool DiskSpaceOk = false; + DiskSpace Disk; + + std::filesystem::path ChildDir = m_RunEnvironment.GetChildBaseDir(); + if (!ChildDir.empty()) + { + if (DiskSpaceInfo(ChildDir, Disk)) + { + DiskSpaceOk = true; + } + else + { + ZEN_WARN("Failed to query disk space for '{}'; disk-based provisioning limits will not be enforced", ChildDir); + } + } + + SystemMetrics Metrics = GetSystemMetrics(); + + m_Lock.WithExclusiveLock([&]() { + if (DiskSpaceOk) + { + m_DiskSpace = Disk; + } + m_SystemMetrics = Metrics; + }); + } + catch (const std::exception& Ex) + { + ZEN_WARN("Failed to update machine metrics. Reason: {}", Ex.what()); + } +} + +void Hub::WatchDog() { const uint64_t CycleIntervalMs = std::chrono::duration_cast<std::chrono::milliseconds>(m_Config.WatchDog.CycleInterval).count(); @@ -1326,16 +1405,18 @@ Hub::WatchDog() [&]() -> bool { return m_WatchDogEvent.Wait(0); }); size_t CheckInstanceIndex = SIZE_MAX; // first increment wraps to 0 - while (!m_WatchDogEvent.Wait(gsl::narrow<int>(CycleIntervalMs))) + while (!m_ShutdownFlag.load() && !m_WatchDogEvent.Wait(gsl::narrow<int>(CycleIntervalMs))) { try { + UpdateMachineMetrics(); + // Snapshot slot count. We iterate all slots (including freed nulls) so // round-robin coverage is not skewed by deprovisioned entries. size_t SlotsRemaining = m_Lock.WithSharedLock([this]() { return m_ActiveInstances.size(); }); Stopwatch Timer; - bool ShuttingDown = false; + bool ShuttingDown = m_ShutdownFlag.load(); while (SlotsRemaining > 0 && Timer.GetElapsedTimeMs() < CycleProcessingBudgetMs && !ShuttingDown) { StorageServerInstance::SharedLockedPtr LockedInstance; @@ -1366,16 +1447,24 @@ Hub::WatchDog() std::string ModuleId(LockedInstance.GetModuleId()); - bool InstanceIsOk = CheckInstanceStatus(ActivityCheckClient, std::move(LockedInstance), CheckInstanceIndex); - if (InstanceIsOk) + try { - ShuttingDown = m_WatchDogEvent.Wait(gsl::narrow<int>(InstanceCheckThrottleMs)); + bool InstanceIsOk = CheckInstanceStatus(ActivityCheckClient, std::move(LockedInstance), CheckInstanceIndex); + if (InstanceIsOk) + { + ShuttingDown = m_WatchDogEvent.Wait(gsl::narrow<int>(InstanceCheckThrottleMs)); + } + else + { + ZEN_WARN("Instance for module '{}' is not running, attempting recovery", ModuleId); + AttemptRecoverInstance(ModuleId); + } } - else + catch (const std::exception& Ex) { - ZEN_WARN("Instance for module '{}' is not running, attempting recovery", ModuleId); - AttemptRecoverInstance(ModuleId); + ZEN_WARN("Failed to check status of module {}. Reason: {}", ModuleId, Ex.what()); } + ShuttingDown |= m_ShutdownFlag.load(); } } catch (const std::exception& Ex) @@ -1515,6 +1604,8 @@ TEST_CASE("hub.provision_basic") Hub::InstanceInfo InstanceInfo; REQUIRE(HubInstance->Find("module_a", &InstanceInfo)); CHECK_EQ(InstanceInfo.State, HubInstanceState::Provisioned); + CHECK_NE(InstanceInfo.StateChangeTime, std::chrono::system_clock::time_point::min()); + CHECK_LE(InstanceInfo.StateChangeTime, std::chrono::system_clock::now()); { HttpClient ModClient(fmt::format("http://localhost:{}", Info.Port), kFastTimeout); @@ -1934,6 +2025,9 @@ TEST_CASE("hub.hibernate_wake") } REQUIRE(HubInstance->Find("hib_a", &Info)); CHECK_EQ(Info.State, HubInstanceState::Provisioned); + const std::chrono::system_clock::time_point ProvisionedTime = Info.StateChangeTime; + CHECK_NE(ProvisionedTime, std::chrono::system_clock::time_point::min()); + CHECK_LE(ProvisionedTime, std::chrono::system_clock::now()); { HttpClient ModClient(fmt::format("http://localhost:{}", ProvInfo.Port), kFastTimeout); CHECK(ModClient.Get("/health/")); @@ -1944,6 +2038,8 @@ TEST_CASE("hub.hibernate_wake") REQUIRE_MESSAGE(HibernateResult.ResponseCode == Hub::EResponseCode::Completed, HibernateResult.Message); REQUIRE(HubInstance->Find("hib_a", &Info)); CHECK_EQ(Info.State, HubInstanceState::Hibernated); + const std::chrono::system_clock::time_point HibernatedTime = Info.StateChangeTime; + CHECK_GE(HibernatedTime, ProvisionedTime); { HttpClient ModClient(fmt::format("http://localhost:{}", ProvInfo.Port), kFastTimeout); CHECK(!ModClient.Get("/health/")); @@ -1954,6 +2050,7 @@ TEST_CASE("hub.hibernate_wake") REQUIRE_MESSAGE(WakeResult.ResponseCode == Hub::EResponseCode::Completed, WakeResult.Message); REQUIRE(HubInstance->Find("hib_a", &Info)); CHECK_EQ(Info.State, HubInstanceState::Provisioned); + CHECK_GE(Info.StateChangeTime, HibernatedTime); { HttpClient ModClient(fmt::format("http://localhost:{}", ProvInfo.Port), kFastTimeout); CHECK(ModClient.Get("/health/")); @@ -2352,7 +2449,7 @@ TEST_CASE("hub.async_provision_shutdown_waits") TEST_CASE("hub.async_provision_rejected") { - // Rejection from CanProvisionInstance fires synchronously even when a WorkerPool is present. + // Rejection from CanProvisionInstanceLocked fires synchronously even when a WorkerPool is present. ScopedTemporaryDirectory TempDir; Hub::Configuration Config; @@ -2369,7 +2466,7 @@ TEST_CASE("hub.async_provision_rejected") REQUIRE_MESSAGE(FirstResult.ResponseCode == Hub::EResponseCode::Accepted, FirstResult.Message); REQUIRE_NE(Info.Port, 0); - // Second provision: CanProvisionInstance rejects synchronously (limit reached), returns Rejected + // Second provision: CanProvisionInstanceLocked rejects synchronously (limit reached), returns Rejected HubProvisionedInstanceInfo Info2; const Hub::Response SecondResult = HubInstance->Provision("async_r2", Info2); CHECK(SecondResult.ResponseCode == Hub::EResponseCode::Rejected); @@ -2485,6 +2582,55 @@ TEST_CASE("hub.instance.inactivity.deprovision") HubInstance->Shutdown(); } +TEST_CASE("hub.machine_metrics") +{ + ScopedTemporaryDirectory TempDir; + + std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path(), {}); + + // UpdateMachineMetrics() is called synchronously in the Hub constructor, so metrics + // are available immediately without waiting for a watchdog cycle. + SystemMetrics SysMetrics; + DiskSpace Disk; + HubInstance->GetMachineMetrics(SysMetrics, Disk); + + CHECK_GT(Disk.Total, 0u); + CHECK_LE(Disk.Free, Disk.Total); + + CHECK_GT(SysMetrics.SystemMemoryMiB, 0u); + CHECK_LE(SysMetrics.AvailSystemMemoryMiB, SysMetrics.SystemMemoryMiB); + + CHECK_GT(SysMetrics.VirtualMemoryMiB, 0u); + CHECK_LE(SysMetrics.AvailVirtualMemoryMiB, SysMetrics.VirtualMemoryMiB); +} + +TEST_CASE("hub.provision_rejected_resource_limits") +{ + // The Hub constructor calls UpdateMachineMetrics() synchronously, so CanProvisionInstanceLocked + // can enforce limits immediately without waiting for a watchdog cycle. + ScopedTemporaryDirectory TempDir; + + { + Hub::Configuration Config; + Config.ResourceLimits.DiskUsageBytes = 1; + std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path(), Config); + HubProvisionedInstanceInfo Info; + const Hub::Response Result = HubInstance->Provision("disk_limit", Info); + CHECK(Result.ResponseCode == Hub::EResponseCode::Rejected); + CHECK_NE(Result.Message.find("disk usage"), std::string::npos); + } + + { + Hub::Configuration Config; + Config.ResourceLimits.MemoryUsageBytes = 1; + std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path(), Config); + HubProvisionedInstanceInfo Info; + const Hub::Response Result = HubInstance->Provision("mem_limit", Info); + CHECK(Result.ResponseCode == Hub::EResponseCode::Rejected); + CHECK_NE(Result.Message.find("ram usage"), std::string::npos); + } +} + TEST_SUITE_END(); void diff --git a/src/zenserver/hub/hub.h b/src/zenserver/hub/hub.h index c343b19e2..9895f7068 100644 --- a/src/zenserver/hub/hub.h +++ b/src/zenserver/hub/hub.h @@ -6,6 +6,7 @@ #include "resourcemetrics.h" #include "storageserverinstance.h" +#include <zencore/filesystem.h> #include <zencore/system.h> #include <zenutil/zenserverprocess.h> @@ -68,6 +69,8 @@ public: std::string HydrationTargetSpecification; WatchDogConfiguration WatchDog; + + ResourceMetrics ResourceLimits; }; typedef std::function< @@ -86,7 +89,7 @@ public: struct InstanceInfo { HubInstanceState State = HubInstanceState::Unprovisioned; - std::chrono::system_clock::time_point ProvisionTime; + std::chrono::system_clock::time_point StateChangeTime; ProcessMetrics Metrics; uint16_t Port = 0; }; @@ -160,6 +163,8 @@ public: int GetMaxInstanceCount() const { return m_MaxInstanceCount.load(); } + void GetMachineMetrics(SystemMetrics& OutSystemMetrict, DiskSpace& OutDiskSpace) const; + const Configuration& GetConfig() const { return m_Config; } #if ZEN_WITH_TESTS @@ -181,9 +186,25 @@ private: #if ZEN_PLATFORM_WINDOWS JobObject m_JobObject; #endif - RwLock m_Lock; + mutable RwLock m_Lock; std::unordered_map<std::string, size_t> m_InstanceLookup; + // Mirrors ProcessMetrics with atomic fields, enabling lock-free reads alongside watchdog writes. + struct AtomicProcessMetrics + { + std::atomic<uint64_t> MemoryBytes = 0; + std::atomic<uint64_t> KernelTimeMs = 0; + std::atomic<uint64_t> UserTimeMs = 0; + std::atomic<uint64_t> WorkingSetSize = 0; + std::atomic<uint64_t> PeakWorkingSetSize = 0; + std::atomic<uint64_t> PagefileUsage = 0; + std::atomic<uint64_t> PeakPagefileUsage = 0; + + ProcessMetrics Load() const; + void Store(const ProcessMetrics& Metrics); + void Reset(); + }; + struct ActiveInstance { // Invariant: Instance == nullptr if and only if State == Unprovisioned. @@ -192,11 +213,16 @@ private: // without holding the hub lock. std::unique_ptr<StorageServerInstance> Instance; std::atomic<HubInstanceState> State = HubInstanceState::Unprovisioned; - // TODO: We should move current metrics here (from StorageServerInstance) - // Read and updated by WatchDog, updates to State triggers a reset of both + // Process metrics - written by WatchDog (inside instance shared lock), read lock-free. + AtomicProcessMetrics ProcessMetrics; + + // Activity tracking - written by WatchDog, reset on every state transition. std::atomic<uint64_t> LastKnownActivitySum = 0; std::atomic<std::chrono::system_clock::time_point> LastActivityTime = std::chrono::system_clock::time_point::min(); + + // Set in UpdateInstanceStateLocked on every state transition; read lock-free by Find/EnumerateModules. + std::atomic<std::chrono::system_clock::time_point> StateChangeTime = std::chrono::system_clock::time_point::min(); }; // UpdateInstanceState is overloaded to accept a locked instance pointer (exclusive or shared) or the hub exclusive @@ -226,21 +252,20 @@ private: std::vector<ActiveInstance> m_ActiveInstances; std::deque<size_t> m_FreeActiveInstanceIndexes; - ResourceMetrics m_ResourceLimits; - SystemMetrics m_HostMetrics; + SystemMetrics m_SystemMetrics; + DiskSpace m_DiskSpace; std::atomic<int> m_MaxInstanceCount = 0; std::thread m_WatchDog; Event m_WatchDogEvent; void WatchDog(); + void UpdateMachineMetrics(); bool CheckInstanceStatus(HttpClient& ActivityHttpClient, StorageServerInstance::SharedLockedPtr&& LockedInstance, size_t ActiveInstanceIndex); void AttemptRecoverInstance(std::string_view ModuleId); - void UpdateStats(); - void UpdateCapacityMetrics(); - bool CanProvisionInstance(std::string_view ModuleId, std::string& OutReason); + bool CanProvisionInstanceLocked(std::string_view ModuleId, std::string& OutReason); uint16_t GetInstanceIndexAssignedPort(size_t ActiveInstanceIndex) const; Response InternalDeprovision(const std::string& ModuleId, std::function<bool(ActiveInstance& Instance)>&& DeprovisionGate); diff --git a/src/zenserver/hub/storageserverinstance.cpp b/src/zenserver/hub/storageserverinstance.cpp index 6b139dbf1..802606f6a 100644 --- a/src/zenserver/hub/storageserverinstance.cpp +++ b/src/zenserver/hub/storageserverinstance.cpp @@ -57,16 +57,15 @@ StorageServerInstance::SpawnServerProcess() m_ServerInstance.EnableShutdownOnDestroy(); } -void -StorageServerInstance::GetProcessMetrics(ProcessMetrics& OutMetrics) const +ProcessMetrics +StorageServerInstance::GetProcessMetrics() const { - OutMetrics.MemoryBytes = m_MemoryBytes.load(); - OutMetrics.KernelTimeMs = m_KernelTimeMs.load(); - OutMetrics.UserTimeMs = m_UserTimeMs.load(); - OutMetrics.WorkingSetSize = m_WorkingSetSize.load(); - OutMetrics.PeakWorkingSetSize = m_PeakWorkingSetSize.load(); - OutMetrics.PagefileUsage = m_PagefileUsage.load(); - OutMetrics.PeakPagefileUsage = m_PeakPagefileUsage.load(); + ProcessMetrics Metrics; + if (m_ServerInstance.IsRunning()) + { + zen::GetProcessMetrics(m_ServerInstance.GetProcessHandle(), Metrics); + } + return Metrics; } void @@ -249,25 +248,6 @@ StorageServerInstance::SharedLockedPtr::IsRunning() const return m_Instance->m_ServerInstance.IsRunning(); } -void -StorageServerInstance::UpdateMetricsLocked() -{ - if (m_ServerInstance.IsRunning()) - { - ProcessMetrics Metrics; - zen::GetProcessMetrics(m_ServerInstance.GetProcessHandle(), Metrics); - - m_MemoryBytes.store(Metrics.MemoryBytes); - m_KernelTimeMs.store(Metrics.KernelTimeMs); - m_UserTimeMs.store(Metrics.UserTimeMs); - m_WorkingSetSize.store(Metrics.WorkingSetSize); - m_PeakWorkingSetSize.store(Metrics.PeakWorkingSetSize); - m_PagefileUsage.store(Metrics.PagefileUsage); - m_PeakPagefileUsage.store(Metrics.PeakPagefileUsage); - } - // TODO: Resource metrics... -} - #if ZEN_WITH_TESTS void StorageServerInstance::SharedLockedPtr::TerminateForTesting() const diff --git a/src/zenserver/hub/storageserverinstance.h b/src/zenserver/hub/storageserverinstance.h index 94c47630c..33646c375 100644 --- a/src/zenserver/hub/storageserverinstance.h +++ b/src/zenserver/hub/storageserverinstance.h @@ -2,8 +2,6 @@ #pragma once -#include "resourcemetrics.h" - #include <zenutil/zenserverprocess.h> #include <atomic> @@ -34,11 +32,9 @@ public: StorageServerInstance(ZenServerEnvironment& RunEnvironment, const Configuration& Config, std::string_view ModuleId); ~StorageServerInstance(); - const ResourceMetrics& GetResourceMetrics() const { return m_ResourceMetrics; } - inline std::string_view GetModuleId() const { return m_ModuleId; } inline uint16_t GetBasePort() const { return m_Config.BasePort; } - void GetProcessMetrics(ProcessMetrics& OutMetrics) const; + ProcessMetrics GetProcessMetrics() const; #if ZEN_PLATFORM_WINDOWS void SetJobObject(JobObject* InJobObject) { m_JobObject = InJobObject; } @@ -68,15 +64,10 @@ public: } bool IsRunning() const; - const ResourceMetrics& GetResourceMetrics() const - { - ZEN_ASSERT(m_Instance); - return m_Instance->m_ResourceMetrics; - } - void UpdateMetrics() + ProcessMetrics GetProcessMetrics() const { ZEN_ASSERT(m_Instance); - return m_Instance->UpdateMetricsLocked(); + return m_Instance->GetProcessMetrics(); } #if ZEN_WITH_TESTS @@ -114,12 +105,6 @@ public: } bool IsRunning() const; - const ResourceMetrics& GetResourceMetrics() const - { - ZEN_ASSERT(m_Instance); - return m_Instance->m_ResourceMetrics; - } - void Provision(); void Deprovision(); void Hibernate(); @@ -139,8 +124,6 @@ private: void HibernateLocked(); void WakeLocked(); - void UpdateMetricsLocked(); - mutable RwLock m_Lock; const Configuration m_Config; std::string m_ModuleId; @@ -149,15 +132,6 @@ private: std::filesystem::path m_BaseDir; std::filesystem::path m_TempDir; - ResourceMetrics m_ResourceMetrics; - - std::atomic<uint64_t> m_MemoryBytes = 0; - std::atomic<uint64_t> m_KernelTimeMs = 0; - std::atomic<uint64_t> m_UserTimeMs = 0; - std::atomic<uint64_t> m_WorkingSetSize = 0; - std::atomic<uint64_t> m_PeakWorkingSetSize = 0; - std::atomic<uint64_t> m_PagefileUsage = 0; - std::atomic<uint64_t> m_PeakPagefileUsage = 0; #if ZEN_PLATFORM_WINDOWS JobObject* m_JobObject = nullptr; diff --git a/src/zenserver/hub/zenhubserver.cpp b/src/zenserver/hub/zenhubserver.cpp index 314031246..2d0d5398b 100644 --- a/src/zenserver/hub/zenhubserver.cpp +++ b/src/zenserver/hub/zenhubserver.cpp @@ -7,12 +7,15 @@ #include "hub.h" #include <zencore/config.h> +#include <zencore/except.h> +#include <zencore/filesystem.h> #include <zencore/fmtutils.h> #include <zencore/memory/llm.h> #include <zencore/memory/memorytrace.h> #include <zencore/memory/tagtrace.h> #include <zencore/scopeguard.h> #include <zencore/sentryintegration.h> +#include <zencore/system.h> #include <zencore/windows.h> #include <zenhttp/httpapiservice.h> #include <zenutil/service.h> @@ -203,6 +206,34 @@ ZenHubServerConfigurator::AddCliOptions(cxxopts::Options& Options) "Request timeout in milliseconds for instance activity check requests", cxxopts::value<uint32_t>(m_ServerOptions.WatchdogConfig.ActivityCheckRequestTimeoutMs)->default_value("200"), "<ms>"); + + Options.add_option("hub", + "", + "hub-provision-disk-limit-bytes", + "Reject provisioning when used disk bytes exceed this value (0 = no limit).", + cxxopts::value<uint64_t>(m_ServerOptions.HubProvisionDiskLimitBytes), + "<bytes>"); + + Options.add_option("hub", + "", + "hub-provision-disk-limit-percent", + "Reject provisioning when used disk exceeds this percentage of total disk (0 = no limit).", + cxxopts::value<uint32_t>(m_ServerOptions.HubProvisionDiskLimitPercent), + "<percent>"); + + Options.add_option("hub", + "", + "hub-provision-memory-limit-bytes", + "Reject provisioning when used memory bytes exceed this value (0 = no limit).", + cxxopts::value<uint64_t>(m_ServerOptions.HubProvisionMemoryLimitBytes), + "<bytes>"); + + Options.add_option("hub", + "", + "hub-provision-memory-limit-percent", + "Reject provisioning when used memory exceeds this percentage of total RAM (0 = no limit).", + cxxopts::value<uint32_t>(m_ServerOptions.HubProvisionMemoryLimitPercent), + "<percent>"); } void @@ -226,6 +257,18 @@ ZenHubServerConfigurator::OnConfigFileParsed(LuaConfig::Options& LuaOptions) void ZenHubServerConfigurator::ValidateOptions() { + if (m_ServerOptions.HubProvisionDiskLimitPercent > 100) + { + throw OptionParseException( + fmt::format("'--hub-provision-disk-limit-percent' ({}) must be in range 0..100", m_ServerOptions.HubProvisionDiskLimitPercent), + {}); + } + if (m_ServerOptions.HubProvisionMemoryLimitPercent > 100) + { + throw OptionParseException(fmt::format("'--hub-provision-memory-limit-percent' ({}) must be in range 0..100", + m_ServerOptions.HubProvisionMemoryLimitPercent), + {}); + } } /////////////////////////////////////////////////////////////////////////// @@ -373,30 +416,71 @@ ZenHubServer::InitializeState(const ZenHubServerConfig& ServerConfig) ZEN_UNUSED(ServerConfig); } +ResourceMetrics +ZenHubServer::ResolveLimits(const ZenHubServerConfig& ServerConfig) +{ + uint64_t DiskTotal = 0; + uint64_t MemoryTotal = 0; + + if (ServerConfig.HubProvisionDiskLimitPercent > 0) + { + DiskSpace Disk; + if (DiskSpaceInfo(ServerConfig.DataDir, Disk)) + { + DiskTotal = Disk.Total; + } + else + { + ZEN_WARN("Failed to query disk space for '{}'; disk percent limit will not be applied", ServerConfig.DataDir); + } + } + if (ServerConfig.HubProvisionMemoryLimitPercent > 0) + { + MemoryTotal = GetSystemMetrics().SystemMemoryMiB * 1024 * 1024; + } + + auto Resolve = [](uint64_t Bytes, uint32_t Pct, uint64_t Total) -> uint64_t { + const uint64_t PctBytes = Pct > 0 ? (Total * Pct) / 100 : 0; + if (Bytes > 0 && PctBytes > 0) + { + return Min(Bytes, PctBytes); + } + return Bytes > 0 ? Bytes : PctBytes; + }; + + return { + .DiskUsageBytes = Resolve(ServerConfig.HubProvisionDiskLimitBytes, ServerConfig.HubProvisionDiskLimitPercent, DiskTotal), + .MemoryUsageBytes = Resolve(ServerConfig.HubProvisionMemoryLimitBytes, ServerConfig.HubProvisionMemoryLimitPercent, MemoryTotal), + }; +} + void ZenHubServer::InitializeServices(const ZenHubServerConfig& ServerConfig) { ZEN_INFO("instantiating Hub"); + Hub::Configuration HubConfig{ + .UseJobObject = ServerConfig.HubUseJobObject, + .BasePortNumber = ServerConfig.HubBasePortNumber, + .InstanceLimit = ServerConfig.HubInstanceLimit, + .InstanceHttpThreadCount = ServerConfig.HubInstanceHttpThreadCount, + .InstanceCoreLimit = ServerConfig.HubInstanceCoreLimit, + .InstanceConfigPath = ServerConfig.HubInstanceConfigPath, + .HydrationTargetSpecification = ServerConfig.HydrationTargetSpecification, + .WatchDog = + { + .CycleInterval = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleIntervalMs), + .CycleProcessingBudget = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleProcessingBudgetMs), + .InstanceCheckThrottle = std::chrono::milliseconds(ServerConfig.WatchdogConfig.InstanceCheckThrottleMs), + .ProvisionedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.ProvisionedInactivityTimeoutSeconds), + .HibernatedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.HibernatedInactivityTimeoutSeconds), + .InactivityCheckMargin = std::chrono::seconds(ServerConfig.WatchdogConfig.InactivityCheckMarginSeconds), + .ActivityCheckConnectTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckConnectTimeoutMs), + .ActivityCheckRequestTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckRequestTimeoutMs), + }, + .ResourceLimits = ResolveLimits(ServerConfig)}; + m_Hub = std::make_unique<Hub>( - Hub::Configuration{ - .UseJobObject = ServerConfig.HubUseJobObject, - .BasePortNumber = ServerConfig.HubBasePortNumber, - .InstanceLimit = ServerConfig.HubInstanceLimit, - .InstanceHttpThreadCount = ServerConfig.HubInstanceHttpThreadCount, - .InstanceCoreLimit = ServerConfig.HubInstanceCoreLimit, - .InstanceConfigPath = ServerConfig.HubInstanceConfigPath, - .HydrationTargetSpecification = ServerConfig.HydrationTargetSpecification, - .WatchDog = - { - .CycleInterval = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleIntervalMs), - .CycleProcessingBudget = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleProcessingBudgetMs), - .InstanceCheckThrottle = std::chrono::milliseconds(ServerConfig.WatchdogConfig.InstanceCheckThrottleMs), - .ProvisionedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.ProvisionedInactivityTimeoutSeconds), - .HibernatedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.HibernatedInactivityTimeoutSeconds), - .InactivityCheckMargin = std::chrono::seconds(ServerConfig.WatchdogConfig.InactivityCheckMarginSeconds), - .ActivityCheckConnectTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckConnectTimeoutMs), - .ActivityCheckRequestTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckRequestTimeoutMs), - }}, + std::move(HubConfig), ZenServerEnvironment(ZenServerEnvironment::Hub, ServerConfig.DataDir / "hub", ServerConfig.DataDir / "servers", diff --git a/src/zenserver/hub/zenhubserver.h b/src/zenserver/hub/zenhubserver.h index 77df3eaa3..9660e9a49 100644 --- a/src/zenserver/hub/zenhubserver.h +++ b/src/zenserver/hub/zenhubserver.h @@ -3,6 +3,7 @@ #pragma once #include "hubinstancestate.h" +#include "resourcemetrics.h" #include "zenserver.h" #include <zenutil/consul.h> @@ -49,6 +50,10 @@ struct ZenHubServerConfig : public ZenServerConfig std::filesystem::path HubInstanceConfigPath; // Path to Lua config file std::string HydrationTargetSpecification; // hydration/dehydration target specification ZenHubWatchdogConfig WatchdogConfig; + uint64_t HubProvisionDiskLimitBytes = 0; + uint32_t HubProvisionDiskLimitPercent = 0; + uint64_t HubProvisionMemoryLimitBytes = 0; + uint32_t HubProvisionMemoryLimitPercent = 0; }; class Hub; @@ -126,6 +131,8 @@ private: uint32_t m_ConsulHealthIntervalSeconds = 10; uint32_t m_ConsulDeregisterAfterSeconds = 30; + static ResourceMetrics ResolveLimits(const ZenHubServerConfig& ServerConfig); + void InitializeState(const ZenHubServerConfig& ServerConfig); void InitializeServices(const ZenHubServerConfig& ServerConfig); void RegisterServices(const ZenHubServerConfig& ServerConfig); diff --git a/src/zenutil/include/zenutil/zenserverprocess.h b/src/zenutil/include/zenutil/zenserverprocess.h index 03d507400..d6f66fbea 100644 --- a/src/zenutil/include/zenutil/zenserverprocess.h +++ b/src/zenutil/include/zenutil/zenserverprocess.h @@ -66,6 +66,7 @@ public: std::filesystem::path CreateNewTestDir(); std::filesystem::path CreateChildDir(std::string_view ChildName); std::filesystem::path ProgramBaseDir() const { return m_ProgramBaseDir; } + std::filesystem::path GetChildBaseDir() const { return m_ChildProcessBaseDir; } std::filesystem::path GetTestRootDir(std::string_view Path); inline bool IsInitialized() const { return m_IsInitialized; } inline bool IsTestEnvironment() const { return m_IsTestInstance; } |