aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2026-03-30 11:21:50 +0200
committerGitHub Enterprise <[email protected]>2026-03-30 11:21:50 +0200
commit10613ce78f4a01658ebacdd0ff8ce464b3d13c3f (patch)
treefb45b1ff37e8c5582c82c56bfd2540511ce94abe /src
parentreuse single MinIO instance across s3client integration test (#901) (diff)
downloadzen-10613ce78f4a01658ebacdd0ff8ce464b3d13c3f.tar.xz
zen-10613ce78f4a01658ebacdd0ff8ce464b3d13c3f.zip
hub resource limits (#900)
- Feature: Hub dashboard now shows a Resources tile with disk and memory usage against configured limits - Feature: Hub module listing now shows state-change timestamps and duration for each instance - Improvement: Hub provisioning rejects new instances when disk or memory usage exceeds configurable thresholds; limits are disabled by default (0 = no limit) - `--hub-provision-disk-limit-bytes` - Reject provisioning when used disk exceeds this many bytes - `--hub-provision-disk-limit-percent` - Reject provisioning when used disk exceeds this percentage of total disk - `--hub-provision-memory-limit-bytes` - Reject provisioning when used memory exceeds this many bytes - `--hub-provision-memory-limit-percent` - Reject provisioning when used memory exceeds this percentage of total RAM - Improvement: Hub process metrics are now tracked atomically per active instance slot, eliminating per-query process handle lookups - Improvement: Hub, Build Store, and Workspaces service stats sections in the dashboard are now collapsible - Bugfix: Hub watchdog loop did not check `m_ShutdownFlag`, causing it to spin indefinitely on shutdown
Diffstat (limited to 'src')
-rw-r--r--src/zenserver/frontend/html/pages/builds.js2
-rw-r--r--src/zenserver/frontend/html/pages/cache.js33
-rw-r--r--src/zenserver/frontend/html/pages/compute.js33
-rw-r--r--src/zenserver/frontend/html/pages/hub.js81
-rw-r--r--src/zenserver/frontend/html/pages/orchestrator.js33
-rw-r--r--src/zenserver/frontend/html/pages/page.js33
-rw-r--r--src/zenserver/frontend/html/pages/projects.js33
-rw-r--r--src/zenserver/frontend/html/pages/workspaces.js2
-rw-r--r--src/zenserver/frontend/html/zen.css4
-rw-r--r--src/zenserver/hub/httphubservice.cpp47
-rw-r--r--src/zenserver/hub/hub.cpp264
-rw-r--r--src/zenserver/hub/hub.h43
-rw-r--r--src/zenserver/hub/storageserverinstance.cpp36
-rw-r--r--src/zenserver/hub/storageserverinstance.h32
-rw-r--r--src/zenserver/hub/zenhubserver.cpp122
-rw-r--r--src/zenserver/hub/zenhubserver.h7
-rw-r--r--src/zenutil/include/zenutil/zenserverprocess.h1
17 files changed, 500 insertions, 306 deletions
diff --git a/src/zenserver/frontend/html/pages/builds.js b/src/zenserver/frontend/html/pages/builds.js
index 095f0bf29..6b3426378 100644
--- a/src/zenserver/frontend/html/pages/builds.js
+++ b/src/zenserver/frontend/html/pages/builds.js
@@ -16,7 +16,7 @@ export class Page extends ZenPage
this.set_title("build store");
// Build Store Stats
- const stats_section = this.add_section("Build Store Stats");
+ const stats_section = this._collapsible_section("Build Store Service Stats");
stats_section.tag().classify("dropall").text("raw yaml \u2192").on_click(() => {
window.open("/stats/builds.yaml", "_blank");
});
diff --git a/src/zenserver/frontend/html/pages/cache.js b/src/zenserver/frontend/html/pages/cache.js
index 1fc8227c8..e0f6f73b6 100644
--- a/src/zenserver/frontend/html/pages/cache.js
+++ b/src/zenserver/frontend/html/pages/cache.js
@@ -95,39 +95,6 @@ export class Page extends ZenPage
}
}
- _collapsible_section(name)
- {
- const section = this.add_section(name);
- const container = section._parent.inner();
- const heading = container.firstElementChild;
-
- heading.style.cursor = "pointer";
- heading.style.userSelect = "none";
-
- const indicator = document.createElement("span");
- indicator.textContent = " \u25BC";
- indicator.style.fontSize = "0.7em";
- heading.appendChild(indicator);
-
- let collapsed = false;
- heading.addEventListener("click", (e) => {
- if (e.target !== heading && e.target !== indicator)
- {
- return;
- }
- collapsed = !collapsed;
- indicator.textContent = collapsed ? " \u25B6" : " \u25BC";
- let sibling = heading.nextElementSibling;
- while (sibling)
- {
- sibling.style.display = collapsed ? "none" : "";
- sibling = sibling.nextElementSibling;
- }
- });
-
- return section;
- }
-
_render_stats(stats)
{
const safe = (obj, path) => path.split(".").reduce((a, b) => a && a[b], obj);
diff --git a/src/zenserver/frontend/html/pages/compute.js b/src/zenserver/frontend/html/pages/compute.js
index ab3d49c27..d1a880954 100644
--- a/src/zenserver/frontend/html/pages/compute.js
+++ b/src/zenserver/frontend/html/pages/compute.js
@@ -100,39 +100,6 @@ export class Page extends ZenPage
}, 2000);
}
- _collapsible_section(name)
- {
- const section = this.add_section(name);
- const container = section._parent.inner();
- const heading = container.firstElementChild;
-
- heading.style.cursor = "pointer";
- heading.style.userSelect = "none";
-
- const indicator = document.createElement("span");
- indicator.textContent = " \u25BC";
- indicator.style.fontSize = "0.7em";
- heading.appendChild(indicator);
-
- let collapsed = false;
- heading.addEventListener("click", (e) => {
- if (e.target !== heading && e.target !== indicator)
- {
- return;
- }
- collapsed = !collapsed;
- indicator.textContent = collapsed ? " \u25B6" : " \u25BC";
- let sibling = heading.nextElementSibling;
- while (sibling)
- {
- sibling.style.display = collapsed ? "none" : "";
- sibling = sibling.nextElementSibling;
- }
- });
-
- return section;
- }
-
async _load_chartjs()
{
if (window.Chart)
diff --git a/src/zenserver/frontend/html/pages/hub.js b/src/zenserver/frontend/html/pages/hub.js
index 78e3a090c..c6f96d496 100644
--- a/src/zenserver/frontend/html/pages/hub.js
+++ b/src/zenserver/frontend/html/pages/hub.js
@@ -82,7 +82,7 @@ export class Page extends ZenPage
this.set_title("hub");
// Capacity
- const stats_section = this.add_section("Capacity");
+ const stats_section = this._collapsible_section("Hub Service Stats");
this._stats_grid = stats_section.tag().classify("grid").classify("stats-tiles");
// Modules
@@ -203,27 +203,46 @@ export class Page extends ZenPage
{
const tile = grid.tag().classify("card").classify("stats-tile");
- tile.tag().classify("card-title").text("Active Modules");
+ tile.tag().classify("card-title").text("Instances");
const body = tile.tag().classify("tile-metrics");
this._metric(body, Friendly.sep(current), "currently provisioned", true);
+ this._metric(body, Friendly.sep(max), "high watermark");
+ this._metric(body, Friendly.sep(limit), "maximum allowed");
+ if (limit > 0)
+ {
+ const pct = ((current / limit) * 100).toFixed(0) + "%";
+ this._metric(body, pct, "utilization");
+ }
}
+ const machine = data.machine || {};
+ const limits = data.resource_limits || {};
+ if (machine.disk_total_bytes > 0 || machine.memory_total_mib > 0)
{
- const tile = grid.tag().classify("card").classify("stats-tile");
- tile.tag().classify("card-title").text("Peak Modules");
- const body = tile.tag().classify("tile-metrics");
- this._metric(body, Friendly.sep(max), "high watermark", true);
- }
+ const disk_used = Math.max(0, (machine.disk_total_bytes || 0) - (machine.disk_free_bytes || 0));
+ const mem_used = Math.max(0, (machine.memory_total_mib || 0) - (machine.memory_avail_mib || 0)) * 1024 * 1024;
+ const vmem_used = Math.max(0, (machine.virtual_memory_total_mib || 0) - (machine.virtual_memory_avail_mib || 0)) * 1024 * 1024;
+ const disk_limit = limits.disk_bytes || 0;
+ const mem_limit = limits.memory_bytes || 0;
+ const disk_over = disk_limit > 0 && disk_used > disk_limit;
+ const mem_over = mem_limit > 0 && mem_used > mem_limit;
- {
const tile = grid.tag().classify("card").classify("stats-tile");
- tile.tag().classify("card-title").text("Instance Limit");
- const body = tile.tag().classify("tile-metrics");
- this._metric(body, Friendly.sep(limit), "maximum allowed", true);
- if (limit > 0)
+ if (disk_over || mem_over) { tile.inner().setAttribute("data-over", "true"); }
+ tile.tag().classify("card-title").text("Resources");
+ const columns = tile.tag().classify("tile-columns");
+
+ const left = columns.tag().classify("tile-metrics");
+ this._metric(left, Friendly.bytes(disk_used), "disk used", true);
+ if (disk_limit > 0) { this._metric(left, Friendly.bytes(disk_limit), "disk limit"); }
+
+ const right = columns.tag().classify("tile-metrics");
+ this._metric(right, Friendly.bytes(mem_used), "memory used", true);
+ if (mem_limit > 0) { this._metric(right, Friendly.bytes(mem_limit), "memory limit"); }
+ if (machine.virtual_memory_total_mib > 0)
{
- const pct = ((current / limit) * 100).toFixed(0) + "%";
- this._metric(body, pct, "utilization");
+ this._metric(right, Friendly.bytes(vmem_used), "vmem used", true);
+ this._metric(right, Friendly.bytes(machine.virtual_memory_total_mib * 1024 * 1024), "vmem total");
}
}
}
@@ -284,6 +303,14 @@ export class Page extends ZenPage
}
row.state_text.nodeValue = state;
row.port_text.nodeValue = m.port ? String(m.port) : "";
+ if (m.state_change_time)
+ {
+ const state_label = state.charAt(0).toUpperCase() + state.slice(1);
+ row.state_since_label.textContent = state_label + " since";
+ row.state_age_label.textContent = state_label + " for";
+ row.state_since_node.nodeValue = m.state_change_time;
+ row.state_age_node.nodeValue = Friendly.timespan(Date.now() - new Date(m.state_change_time).getTime());
+ }
row.btn_open.disabled = state !== "provisioned";
row.btn_hibernate.disabled = !_btn_enabled(state, "hibernate");
row.btn_wake.disabled = !_btn_enabled(state, "wake");
@@ -388,7 +415,7 @@ export class Page extends ZenPage
td_action.appendChild(wrap_o);
tr.appendChild(td_action);
- // Build metrics grid from process_metrics keys.
+ // Build metrics grid: fixed state-time rows followed by process_metrics keys.
// Keys are split into two halves and interleaved so the grid fills
// top-to-bottom in the left column before continuing in the right column.
const metric_nodes = new Map();
@@ -396,6 +423,28 @@ export class Page extends ZenPage
metrics_td.colSpan = 6;
const metrics_grid = document.createElement("div");
metrics_grid.className = "module-metrics-grid";
+
+ const _add_fixed_pair = (label, value_str) => {
+ const label_el = document.createElement("span");
+ label_el.className = "module-metrics-label";
+ label_el.textContent = label;
+ const value_node = document.createTextNode(value_str);
+ const value_el = document.createElement("span");
+ value_el.className = "module-metrics-value";
+ value_el.appendChild(value_node);
+ metrics_grid.appendChild(label_el);
+ metrics_grid.appendChild(value_el);
+ return { label_el, value_node };
+ };
+
+ const state_label = m.state ? m.state.charAt(0).toUpperCase() + m.state.slice(1) : "State";
+ const state_since_str = m.state_change_time || "";
+ const state_age_str = m.state_change_time
+ ? Friendly.timespan(Date.now() - new Date(m.state_change_time).getTime())
+ : "";
+ const { label_el: state_since_label, value_node: state_since_node } = _add_fixed_pair(state_label + " since", state_since_str);
+ const { label_el: state_age_label, value_node: state_age_node } = _add_fixed_pair(state_label + " for", state_age_str);
+
const keys = Object.keys(m.process_metrics || {});
const half = Math.ceil(keys.length / 2);
const add_metric_pair = (key) => {
@@ -423,7 +472,7 @@ export class Page extends ZenPage
metrics_td.appendChild(metrics_grid);
metrics_tr.appendChild(metrics_td);
- row = { tr, metrics_tr, idx: td_idx, cb, dot, state_text: state_node, port_text: port_node, btn_expand, btn_open: btn_o, btn_hibernate: btn_h, btn_wake: btn_w, btn_deprov: btn_d, metric_nodes };
+ row = { tr, metrics_tr, idx: td_idx, cb, dot, state_text: state_node, port_text: port_node, btn_expand, btn_open: btn_o, btn_hibernate: btn_h, btn_wake: btn_w, btn_deprov: btn_d, metric_nodes, state_since_node, state_age_node, state_since_label, state_age_label };
this._row_cache.set(id, row);
}
diff --git a/src/zenserver/frontend/html/pages/orchestrator.js b/src/zenserver/frontend/html/pages/orchestrator.js
index 4a9290a3c..a280fabdb 100644
--- a/src/zenserver/frontend/html/pages/orchestrator.js
+++ b/src/zenserver/frontend/html/pages/orchestrator.js
@@ -46,39 +46,6 @@ export class Page extends ZenPage
this._connect_ws();
}
- _collapsible_section(name)
- {
- const section = this.add_section(name);
- const container = section._parent.inner();
- const heading = container.firstElementChild;
-
- heading.style.cursor = "pointer";
- heading.style.userSelect = "none";
-
- const indicator = document.createElement("span");
- indicator.textContent = " \u25BC";
- indicator.style.fontSize = "0.7em";
- heading.appendChild(indicator);
-
- let collapsed = false;
- heading.addEventListener("click", (e) => {
- if (e.target !== heading && e.target !== indicator)
- {
- return;
- }
- collapsed = !collapsed;
- indicator.textContent = collapsed ? " \u25B6" : " \u25BC";
- let sibling = heading.nextElementSibling;
- while (sibling)
- {
- sibling.style.display = collapsed ? "none" : "";
- sibling = sibling.nextElementSibling;
- }
- });
-
- return section;
- }
-
async _fetch_all()
{
try
diff --git a/src/zenserver/frontend/html/pages/page.js b/src/zenserver/frontend/html/pages/page.js
index cf8d3e3dd..ff530ff8e 100644
--- a/src/zenserver/frontend/html/pages/page.js
+++ b/src/zenserver/frontend/html/pages/page.js
@@ -337,4 +337,37 @@ export class ZenPage extends PageBase
this._metric(right, Friendly.duration(reqData.t_max), "max");
}
}
+
+ _collapsible_section(name)
+ {
+ const section = this.add_section(name);
+ const container = section._parent.inner();
+ const heading = container.firstElementChild;
+
+ heading.style.cursor = "pointer";
+ heading.style.userSelect = "none";
+
+ const indicator = document.createElement("span");
+ indicator.textContent = " \u25BC";
+ indicator.style.fontSize = "0.7em";
+ heading.appendChild(indicator);
+
+ let collapsed = false;
+ heading.addEventListener("click", (e) => {
+ if (e.target !== heading && e.target !== indicator)
+ {
+ return;
+ }
+ collapsed = !collapsed;
+ indicator.textContent = collapsed ? " \u25B6" : " \u25BC";
+ let sibling = heading.nextElementSibling;
+ while (sibling)
+ {
+ sibling.style.display = collapsed ? "none" : "";
+ sibling = sibling.nextElementSibling;
+ }
+ });
+
+ return section;
+ }
}
diff --git a/src/zenserver/frontend/html/pages/projects.js b/src/zenserver/frontend/html/pages/projects.js
index 2469bf70b..dfe4faeb8 100644
--- a/src/zenserver/frontend/html/pages/projects.js
+++ b/src/zenserver/frontend/html/pages/projects.js
@@ -110,39 +110,6 @@ export class Page extends ZenPage
}
}
- _collapsible_section(name)
- {
- const section = this.add_section(name);
- const container = section._parent.inner();
- const heading = container.firstElementChild;
-
- heading.style.cursor = "pointer";
- heading.style.userSelect = "none";
-
- const indicator = document.createElement("span");
- indicator.textContent = " \u25BC";
- indicator.style.fontSize = "0.7em";
- heading.appendChild(indicator);
-
- let collapsed = false;
- heading.addEventListener("click", (e) => {
- if (e.target !== heading && e.target !== indicator)
- {
- return;
- }
- collapsed = !collapsed;
- indicator.textContent = collapsed ? " \u25B6" : " \u25BC";
- let sibling = heading.nextElementSibling;
- while (sibling)
- {
- sibling.style.display = collapsed ? "none" : "";
- sibling = sibling.nextElementSibling;
- }
- });
-
- return section;
- }
-
_clear_param(name)
{
this._params.delete(name);
diff --git a/src/zenserver/frontend/html/pages/workspaces.js b/src/zenserver/frontend/html/pages/workspaces.js
index d31fd7373..2442fb35b 100644
--- a/src/zenserver/frontend/html/pages/workspaces.js
+++ b/src/zenserver/frontend/html/pages/workspaces.js
@@ -13,7 +13,7 @@ export class Page extends ZenPage
this.set_title("workspaces");
// Workspace Service Stats
- const stats_section = this.add_section("Workspace Service Stats");
+ const stats_section = this._collapsible_section("Workspace Service Stats");
this._stats_grid = stats_section.tag().classify("grid").classify("stats-tiles");
const stats = await new Fetcher().resource("stats", "ws").json().catch(() => null);
diff --git a/src/zenserver/frontend/html/zen.css b/src/zenserver/frontend/html/zen.css
index d9f7491ea..cb3d78cf2 100644
--- a/src/zenserver/frontend/html/zen.css
+++ b/src/zenserver/frontend/html/zen.css
@@ -816,6 +816,10 @@ zen-banner + zen-nav::part(nav-bar) {
border-color: var(--theme_p0);
}
+.stats-tile[data-over="true"] {
+ border-color: var(--theme_fail);
+}
+
.stats-tile-detailed {
position: relative;
}
diff --git a/src/zenserver/hub/httphubservice.cpp b/src/zenserver/hub/httphubservice.cpp
index ebefcf2e3..d52da5ae7 100644
--- a/src/zenserver/hub/httphubservice.cpp
+++ b/src/zenserver/hub/httphubservice.cpp
@@ -78,6 +78,10 @@ HttpHubService::HttpHubService(Hub& Hub, HttpStatsService& StatsService, HttpSta
Obj << "moduleId" << ModuleId;
Obj << "state" << ToString(Info.State);
Obj << "port" << Info.Port;
+ if (Info.StateChangeTime != std::chrono::system_clock::time_point::min())
+ {
+ Obj << "state_change_time" << ToDateTime(Info.StateChangeTime);
+ }
Obj.BeginObject("process_metrics");
{
Obj << "MemoryBytes" << Info.Metrics.MemoryBytes;
@@ -228,17 +232,6 @@ HttpHubService::HttpHubService(Hub& Hub, HttpStatsService& StatsService, HttpSta
},
HttpVerb::kPost);
- m_Router.RegisterRoute(
- "stats",
- [this](HttpRouterRequest& Req) {
- CbObjectWriter Obj;
- Obj << "currentInstanceCount" << m_Hub.GetInstanceCount();
- Obj << "maxInstanceCount" << m_Hub.GetMaxInstanceCount();
- Obj << "instanceLimit" << m_Hub.GetConfig().InstanceLimit;
- Req.ServerRequest().WriteResponse(HttpResponseCode::OK, Obj.Save());
- },
- HttpVerb::kGet);
-
m_StatsService.RegisterHandler("hub", *this);
m_StatusService.RegisterHandler("hub", *this);
}
@@ -286,7 +279,37 @@ HttpHubService::HandleStatusRequest(HttpServerRequest& Request)
void
HttpHubService::HandleStatsRequest(HttpServerRequest& Request)
{
- Request.WriteResponse(HttpResponseCode::OK, CollectStats());
+ CbObjectWriter Cbo;
+
+ EmitSnapshot("requests", m_HttpRequests, Cbo);
+
+ Cbo << "currentInstanceCount" << m_Hub.GetInstanceCount();
+ Cbo << "maxInstanceCount" << m_Hub.GetMaxInstanceCount();
+ Cbo << "instanceLimit" << m_Hub.GetConfig().InstanceLimit;
+
+ SystemMetrics SysMetrics;
+ DiskSpace Disk;
+ m_Hub.GetMachineMetrics(SysMetrics, Disk);
+ Cbo.BeginObject("machine");
+ {
+ Cbo << "disk_free_bytes" << Disk.Free;
+ Cbo << "disk_total_bytes" << Disk.Total;
+ Cbo << "memory_avail_mib" << SysMetrics.AvailSystemMemoryMiB;
+ Cbo << "memory_total_mib" << SysMetrics.SystemMemoryMiB;
+ Cbo << "virtual_memory_avail_mib" << SysMetrics.AvailVirtualMemoryMiB;
+ Cbo << "virtual_memory_total_mib" << SysMetrics.VirtualMemoryMiB;
+ }
+ Cbo.EndObject();
+
+ const ResourceMetrics& Limits = m_Hub.GetConfig().ResourceLimits;
+ Cbo.BeginObject("resource_limits");
+ {
+ Cbo << "disk_bytes" << Limits.DiskUsageBytes;
+ Cbo << "memory_bytes" << Limits.MemoryUsageBytes;
+ }
+ Cbo.EndObject();
+
+ Request.WriteResponse(HttpResponseCode::OK, Cbo.Save());
}
CbObject
diff --git a/src/zenserver/hub/hub.cpp b/src/zenserver/hub/hub.cpp
index 6c44e2333..bf846d68e 100644
--- a/src/zenserver/hub/hub.cpp
+++ b/src/zenserver/hub/hub.cpp
@@ -19,7 +19,6 @@ ZEN_THIRD_PARTY_INCLUDES_START
ZEN_THIRD_PARTY_INCLUDES_END
#if ZEN_WITH_TESTS
-# include <zencore/filesystem.h>
# include <zencore/testing.h>
# include <zencore/testutils.h>
#endif
@@ -122,6 +121,55 @@ private:
//////////////////////////////////////////////////////////////////////////
+ProcessMetrics
+Hub::AtomicProcessMetrics::Load() const
+{
+ return {
+ .MemoryBytes = MemoryBytes.load(),
+ .KernelTimeMs = KernelTimeMs.load(),
+ .UserTimeMs = UserTimeMs.load(),
+ .WorkingSetSize = WorkingSetSize.load(),
+ .PeakWorkingSetSize = PeakWorkingSetSize.load(),
+ .PagefileUsage = PagefileUsage.load(),
+ .PeakPagefileUsage = PeakPagefileUsage.load(),
+ };
+}
+
+void
+Hub::AtomicProcessMetrics::Store(const ProcessMetrics& Metrics)
+{
+ MemoryBytes.store(Metrics.MemoryBytes);
+ KernelTimeMs.store(Metrics.KernelTimeMs);
+ UserTimeMs.store(Metrics.UserTimeMs);
+ WorkingSetSize.store(Metrics.WorkingSetSize);
+ PeakWorkingSetSize.store(Metrics.PeakWorkingSetSize);
+ PagefileUsage.store(Metrics.PagefileUsage);
+ PeakPagefileUsage.store(Metrics.PeakPagefileUsage);
+}
+
+void
+Hub::AtomicProcessMetrics::Reset()
+{
+ MemoryBytes.store(0);
+ KernelTimeMs.store(0);
+ UserTimeMs.store(0);
+ WorkingSetSize.store(0);
+ PeakWorkingSetSize.store(0);
+ PagefileUsage.store(0);
+ PeakPagefileUsage.store(0);
+}
+
+void
+Hub::GetMachineMetrics(SystemMetrics& OutSystemMetrict, DiskSpace& OutDiskSpace) const
+{
+ m_Lock.WithSharedLock([&]() {
+ OutSystemMetrict = m_SystemMetrics;
+ OutDiskSpace = m_DiskSpace;
+ });
+}
+
+//////////////////////////////////////////////////////////////////////////
+
Hub::Hub(const Configuration& Config,
ZenServerEnvironment&& RunEnvironment,
WorkerThreadPool* OptionalWorkerPool,
@@ -134,10 +182,6 @@ Hub::Hub(const Configuration& Config,
, m_ActiveInstances(Config.InstanceLimit)
, m_FreeActiveInstanceIndexes(Config.InstanceLimit)
{
- m_HostMetrics = GetSystemMetrics();
- m_ResourceLimits.DiskUsageBytes = 1000ull * 1024 * 1024 * 1024;
- m_ResourceLimits.MemoryUsageBytes = 16ull * 1024 * 1024 * 1024;
-
if (m_Config.HydrationTargetSpecification.empty())
{
std::filesystem::path FileHydrationPath = m_RunEnvironment.CreateChildDir("hydration_storage");
@@ -171,6 +215,9 @@ Hub::Hub(const Configuration& Config,
}
}
#endif
+
+ UpdateMachineMetrics();
+
m_WatchDog = std::thread([this]() { WatchDog(); });
}
@@ -195,6 +242,9 @@ Hub::Shutdown()
{
ZEN_INFO("Hub service shutting down, deprovisioning any current instances");
+ bool Expected = false;
+ bool WaitForBackgroundWork = m_ShutdownFlag.compare_exchange_strong(Expected, true);
+
m_WatchDogEvent.Set();
if (m_WatchDog.joinable())
{
@@ -203,8 +253,6 @@ Hub::Shutdown()
m_WatchDog = {};
- bool Expected = false;
- bool WaitForBackgroundWork = m_ShutdownFlag.compare_exchange_strong(Expected, true);
if (WaitForBackgroundWork && m_WorkerPool)
{
m_BackgroundWorkLatch.CountDown();
@@ -254,7 +302,7 @@ Hub::Provision(std::string_view ModuleId, HubProvisionedInstanceInfo& OutInfo)
if (auto It = m_InstanceLookup.find(std::string(ModuleId)); It == m_InstanceLookup.end())
{
std::string Reason;
- if (!CanProvisionInstance(ModuleId, /* out */ Reason))
+ if (!CanProvisionInstanceLocked(ModuleId, /* out */ Reason))
{
ZEN_WARN("Cannot provision new storage server instance for module '{}': {}", ModuleId, Reason);
@@ -289,6 +337,7 @@ Hub::Provision(std::string_view ModuleId, HubProvisionedInstanceInfo& OutInfo)
Instance = NewInstance->LockExclusive(/*Wait*/ true);
m_ActiveInstances[ActiveInstanceIndex].Instance = std::move(NewInstance);
+ m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Reset();
m_InstanceLookup.insert_or_assign(std::string(ModuleId), ActiveInstanceIndex);
// Set Provisioning while both hub lock and instance lock are held so that any
// concurrent Deprovision sees the in-flight state, not Unprovisioned.
@@ -947,12 +996,10 @@ Hub::Find(std::string_view ModuleId, InstanceInfo* OutInstanceInfo)
ZEN_ASSERT(ActiveInstanceIndex < m_ActiveInstances.size());
const std::unique_ptr<StorageServerInstance>& Instance = m_ActiveInstances[ActiveInstanceIndex].Instance;
ZEN_ASSERT(Instance);
- InstanceInfo Info{
- m_ActiveInstances[ActiveInstanceIndex].State.load(),
- std::chrono::system_clock::now() // TODO
- };
- Instance->GetProcessMetrics(Info.Metrics);
- Info.Port = Instance->GetBasePort();
+ InstanceInfo Info{m_ActiveInstances[ActiveInstanceIndex].State.load(),
+ m_ActiveInstances[ActiveInstanceIndex].StateChangeTime.load()};
+ Info.Metrics = m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Load();
+ Info.Port = Instance->GetBasePort();
*OutInstanceInfo = Info;
}
@@ -971,12 +1018,10 @@ Hub::EnumerateModules(std::function<void(std::string_view ModuleId, const Instan
{
const std::unique_ptr<StorageServerInstance>& Instance = m_ActiveInstances[ActiveInstanceIndex].Instance;
ZEN_ASSERT(Instance);
- InstanceInfo Info{
- m_ActiveInstances[ActiveInstanceIndex].State.load(),
- std::chrono::system_clock::now() // TODO
- };
- Instance->GetProcessMetrics(Info.Metrics);
- Info.Port = Instance->GetBasePort();
+ InstanceInfo Info{m_ActiveInstances[ActiveInstanceIndex].State.load(),
+ m_ActiveInstances[ActiveInstanceIndex].StateChangeTime.load()};
+ Info.Metrics = m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Load();
+ Info.Port = Instance->GetBasePort();
Infos.push_back(std::make_pair(std::string(Instance->GetModuleId()), Info));
}
@@ -994,28 +1039,8 @@ Hub::GetInstanceCount()
return m_Lock.WithSharedLock([this]() { return gsl::narrow_cast<int>(m_InstanceLookup.size()); });
}
-void
-Hub::UpdateCapacityMetrics()
-{
- m_HostMetrics = GetSystemMetrics();
-
- // TODO: Should probably go into WatchDog and use atomic for update so it can be read without locks...
- // Per-instance stats are already refreshed by WatchDog and are readable via the Find and EnumerateModules
-}
-
-void
-Hub::UpdateStats()
-{
- int CurrentInstanceCount = m_Lock.WithSharedLock([this] { return gsl::narrow_cast<int>(m_InstanceLookup.size()); });
- int CurrentMaxCount = m_MaxInstanceCount.load();
-
- int NewMax = Max(CurrentMaxCount, CurrentInstanceCount);
-
- m_MaxInstanceCount.compare_exchange_weak(CurrentMaxCount, NewMax);
-}
-
bool
-Hub::CanProvisionInstance(std::string_view ModuleId, std::string& OutReason)
+Hub::CanProvisionInstanceLocked(std::string_view ModuleId, std::string& OutReason)
{
ZEN_UNUSED(ModuleId);
if (m_FreeActiveInstanceIndexes.empty())
@@ -1025,7 +1050,24 @@ Hub::CanProvisionInstance(std::string_view ModuleId, std::string& OutReason)
return false;
}
- // TODO: handle additional resource metrics
+ const uint64_t DiskUsedBytes = m_DiskSpace.Free <= m_DiskSpace.Total ? m_DiskSpace.Total - m_DiskSpace.Free : 0;
+ if (m_Config.ResourceLimits.DiskUsageBytes > 0 && DiskUsedBytes > m_Config.ResourceLimits.DiskUsageBytes)
+ {
+ OutReason =
+ fmt::format("disk usage ({}) exceeds ({})", NiceBytes(DiskUsedBytes), NiceBytes(m_Config.ResourceLimits.DiskUsageBytes));
+ return false;
+ }
+
+ const uint64_t RamUsedMiB = m_SystemMetrics.AvailSystemMemoryMiB <= m_SystemMetrics.SystemMemoryMiB
+ ? m_SystemMetrics.SystemMemoryMiB - m_SystemMetrics.AvailSystemMemoryMiB
+ : 0;
+ const uint64_t RamUsedBytes = RamUsedMiB * 1024 * 1024;
+ if (m_Config.ResourceLimits.MemoryUsageBytes > 0 && RamUsedBytes > m_Config.ResourceLimits.MemoryUsageBytes)
+ {
+ OutReason =
+ fmt::format("ram usage ({}) exceeds ({})", NiceBytes(RamUsedBytes), NiceBytes(m_Config.ResourceLimits.MemoryUsageBytes));
+ return false;
+ }
return true;
}
@@ -1065,8 +1107,10 @@ Hub::UpdateInstanceStateLocked(size_t ActiveInstanceIndex, HubInstanceState NewS
}
return false;
}(m_ActiveInstances[ActiveInstanceIndex].State.load(), NewState));
+ const std::chrono::system_clock::time_point Now = std::chrono::system_clock::now();
m_ActiveInstances[ActiveInstanceIndex].LastKnownActivitySum.store(0);
- m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.store(std::chrono::system_clock::now());
+ m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.store(Now);
+ m_ActiveInstances[ActiveInstanceIndex].StateChangeTime.store(Now);
return m_ActiveInstances[ActiveInstanceIndex].State.exchange(NewState);
}
@@ -1173,14 +1217,14 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient,
StorageServerInstance::SharedLockedPtr&& LockedInstance,
size_t ActiveInstanceIndex)
{
+ const std::string ModuleId(LockedInstance.GetModuleId());
+
HubInstanceState InstanceState = m_ActiveInstances[ActiveInstanceIndex].State.load();
if (LockedInstance.IsRunning())
{
- LockedInstance.UpdateMetrics();
+ m_ActiveInstances[ActiveInstanceIndex].ProcessMetrics.Store(LockedInstance.GetProcessMetrics());
if (InstanceState == HubInstanceState::Provisioned)
{
- const std::string ModuleId(LockedInstance.GetModuleId());
-
const uint16_t Port = LockedInstance.GetBasePort();
const uint64_t PreviousActivitySum = m_ActiveInstances[ActiveInstanceIndex].LastKnownActivitySum.load();
const std::chrono::system_clock::time_point LastActivityTime = m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.load();
@@ -1260,8 +1304,7 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient,
else if (InstanceState == HubInstanceState::Provisioned)
{
// Process is not running but state says it should be - instance died unexpectedly.
- const std::string ModuleId(LockedInstance.GetModuleId());
- const uint16_t Port = LockedInstance.GetBasePort();
+ const uint16_t Port = LockedInstance.GetBasePort();
UpdateInstanceState(LockedInstance, ActiveInstanceIndex, HubInstanceState::Crashed);
NotifyStateUpdate(ModuleId, HubInstanceState::Provisioned, HubInstanceState::Crashed, Port, {});
LockedInstance = {};
@@ -1272,7 +1315,6 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient,
{
// Process is not running - no HTTP activity check is possible.
// Use a pure time-based check; the margin window does not apply here.
- const std::string ModuleId = std::string(LockedInstance.GetModuleId());
const std::chrono::system_clock::time_point LastActivityTime = m_ActiveInstances[ActiveInstanceIndex].LastActivityTime.load();
const uint64_t PreviousActivitySum = m_ActiveInstances[ActiveInstanceIndex].LastKnownActivitySum.load();
const std::chrono::system_clock::time_point Now = std::chrono::system_clock::now();
@@ -1312,6 +1354,43 @@ Hub::CheckInstanceStatus(HttpClient& ActivityCheckClient,
}
void
+Hub::UpdateMachineMetrics()
+{
+ try
+ {
+ bool DiskSpaceOk = false;
+ DiskSpace Disk;
+
+ std::filesystem::path ChildDir = m_RunEnvironment.GetChildBaseDir();
+ if (!ChildDir.empty())
+ {
+ if (DiskSpaceInfo(ChildDir, Disk))
+ {
+ DiskSpaceOk = true;
+ }
+ else
+ {
+ ZEN_WARN("Failed to query disk space for '{}'; disk-based provisioning limits will not be enforced", ChildDir);
+ }
+ }
+
+ SystemMetrics Metrics = GetSystemMetrics();
+
+ m_Lock.WithExclusiveLock([&]() {
+ if (DiskSpaceOk)
+ {
+ m_DiskSpace = Disk;
+ }
+ m_SystemMetrics = Metrics;
+ });
+ }
+ catch (const std::exception& Ex)
+ {
+ ZEN_WARN("Failed to update machine metrics. Reason: {}", Ex.what());
+ }
+}
+
+void
Hub::WatchDog()
{
const uint64_t CycleIntervalMs = std::chrono::duration_cast<std::chrono::milliseconds>(m_Config.WatchDog.CycleInterval).count();
@@ -1326,16 +1405,18 @@ Hub::WatchDog()
[&]() -> bool { return m_WatchDogEvent.Wait(0); });
size_t CheckInstanceIndex = SIZE_MAX; // first increment wraps to 0
- while (!m_WatchDogEvent.Wait(gsl::narrow<int>(CycleIntervalMs)))
+ while (!m_ShutdownFlag.load() && !m_WatchDogEvent.Wait(gsl::narrow<int>(CycleIntervalMs)))
{
try
{
+ UpdateMachineMetrics();
+
// Snapshot slot count. We iterate all slots (including freed nulls) so
// round-robin coverage is not skewed by deprovisioned entries.
size_t SlotsRemaining = m_Lock.WithSharedLock([this]() { return m_ActiveInstances.size(); });
Stopwatch Timer;
- bool ShuttingDown = false;
+ bool ShuttingDown = m_ShutdownFlag.load();
while (SlotsRemaining > 0 && Timer.GetElapsedTimeMs() < CycleProcessingBudgetMs && !ShuttingDown)
{
StorageServerInstance::SharedLockedPtr LockedInstance;
@@ -1366,16 +1447,24 @@ Hub::WatchDog()
std::string ModuleId(LockedInstance.GetModuleId());
- bool InstanceIsOk = CheckInstanceStatus(ActivityCheckClient, std::move(LockedInstance), CheckInstanceIndex);
- if (InstanceIsOk)
+ try
{
- ShuttingDown = m_WatchDogEvent.Wait(gsl::narrow<int>(InstanceCheckThrottleMs));
+ bool InstanceIsOk = CheckInstanceStatus(ActivityCheckClient, std::move(LockedInstance), CheckInstanceIndex);
+ if (InstanceIsOk)
+ {
+ ShuttingDown = m_WatchDogEvent.Wait(gsl::narrow<int>(InstanceCheckThrottleMs));
+ }
+ else
+ {
+ ZEN_WARN("Instance for module '{}' is not running, attempting recovery", ModuleId);
+ AttemptRecoverInstance(ModuleId);
+ }
}
- else
+ catch (const std::exception& Ex)
{
- ZEN_WARN("Instance for module '{}' is not running, attempting recovery", ModuleId);
- AttemptRecoverInstance(ModuleId);
+ ZEN_WARN("Failed to check status of module {}. Reason: {}", ModuleId, Ex.what());
}
+ ShuttingDown |= m_ShutdownFlag.load();
}
}
catch (const std::exception& Ex)
@@ -1515,6 +1604,8 @@ TEST_CASE("hub.provision_basic")
Hub::InstanceInfo InstanceInfo;
REQUIRE(HubInstance->Find("module_a", &InstanceInfo));
CHECK_EQ(InstanceInfo.State, HubInstanceState::Provisioned);
+ CHECK_NE(InstanceInfo.StateChangeTime, std::chrono::system_clock::time_point::min());
+ CHECK_LE(InstanceInfo.StateChangeTime, std::chrono::system_clock::now());
{
HttpClient ModClient(fmt::format("http://localhost:{}", Info.Port), kFastTimeout);
@@ -1934,6 +2025,9 @@ TEST_CASE("hub.hibernate_wake")
}
REQUIRE(HubInstance->Find("hib_a", &Info));
CHECK_EQ(Info.State, HubInstanceState::Provisioned);
+ const std::chrono::system_clock::time_point ProvisionedTime = Info.StateChangeTime;
+ CHECK_NE(ProvisionedTime, std::chrono::system_clock::time_point::min());
+ CHECK_LE(ProvisionedTime, std::chrono::system_clock::now());
{
HttpClient ModClient(fmt::format("http://localhost:{}", ProvInfo.Port), kFastTimeout);
CHECK(ModClient.Get("/health/"));
@@ -1944,6 +2038,8 @@ TEST_CASE("hub.hibernate_wake")
REQUIRE_MESSAGE(HibernateResult.ResponseCode == Hub::EResponseCode::Completed, HibernateResult.Message);
REQUIRE(HubInstance->Find("hib_a", &Info));
CHECK_EQ(Info.State, HubInstanceState::Hibernated);
+ const std::chrono::system_clock::time_point HibernatedTime = Info.StateChangeTime;
+ CHECK_GE(HibernatedTime, ProvisionedTime);
{
HttpClient ModClient(fmt::format("http://localhost:{}", ProvInfo.Port), kFastTimeout);
CHECK(!ModClient.Get("/health/"));
@@ -1954,6 +2050,7 @@ TEST_CASE("hub.hibernate_wake")
REQUIRE_MESSAGE(WakeResult.ResponseCode == Hub::EResponseCode::Completed, WakeResult.Message);
REQUIRE(HubInstance->Find("hib_a", &Info));
CHECK_EQ(Info.State, HubInstanceState::Provisioned);
+ CHECK_GE(Info.StateChangeTime, HibernatedTime);
{
HttpClient ModClient(fmt::format("http://localhost:{}", ProvInfo.Port), kFastTimeout);
CHECK(ModClient.Get("/health/"));
@@ -2352,7 +2449,7 @@ TEST_CASE("hub.async_provision_shutdown_waits")
TEST_CASE("hub.async_provision_rejected")
{
- // Rejection from CanProvisionInstance fires synchronously even when a WorkerPool is present.
+ // Rejection from CanProvisionInstanceLocked fires synchronously even when a WorkerPool is present.
ScopedTemporaryDirectory TempDir;
Hub::Configuration Config;
@@ -2369,7 +2466,7 @@ TEST_CASE("hub.async_provision_rejected")
REQUIRE_MESSAGE(FirstResult.ResponseCode == Hub::EResponseCode::Accepted, FirstResult.Message);
REQUIRE_NE(Info.Port, 0);
- // Second provision: CanProvisionInstance rejects synchronously (limit reached), returns Rejected
+ // Second provision: CanProvisionInstanceLocked rejects synchronously (limit reached), returns Rejected
HubProvisionedInstanceInfo Info2;
const Hub::Response SecondResult = HubInstance->Provision("async_r2", Info2);
CHECK(SecondResult.ResponseCode == Hub::EResponseCode::Rejected);
@@ -2485,6 +2582,55 @@ TEST_CASE("hub.instance.inactivity.deprovision")
HubInstance->Shutdown();
}
+TEST_CASE("hub.machine_metrics")
+{
+ ScopedTemporaryDirectory TempDir;
+
+ std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path(), {});
+
+ // UpdateMachineMetrics() is called synchronously in the Hub constructor, so metrics
+ // are available immediately without waiting for a watchdog cycle.
+ SystemMetrics SysMetrics;
+ DiskSpace Disk;
+ HubInstance->GetMachineMetrics(SysMetrics, Disk);
+
+ CHECK_GT(Disk.Total, 0u);
+ CHECK_LE(Disk.Free, Disk.Total);
+
+ CHECK_GT(SysMetrics.SystemMemoryMiB, 0u);
+ CHECK_LE(SysMetrics.AvailSystemMemoryMiB, SysMetrics.SystemMemoryMiB);
+
+ CHECK_GT(SysMetrics.VirtualMemoryMiB, 0u);
+ CHECK_LE(SysMetrics.AvailVirtualMemoryMiB, SysMetrics.VirtualMemoryMiB);
+}
+
+TEST_CASE("hub.provision_rejected_resource_limits")
+{
+ // The Hub constructor calls UpdateMachineMetrics() synchronously, so CanProvisionInstanceLocked
+ // can enforce limits immediately without waiting for a watchdog cycle.
+ ScopedTemporaryDirectory TempDir;
+
+ {
+ Hub::Configuration Config;
+ Config.ResourceLimits.DiskUsageBytes = 1;
+ std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path(), Config);
+ HubProvisionedInstanceInfo Info;
+ const Hub::Response Result = HubInstance->Provision("disk_limit", Info);
+ CHECK(Result.ResponseCode == Hub::EResponseCode::Rejected);
+ CHECK_NE(Result.Message.find("disk usage"), std::string::npos);
+ }
+
+ {
+ Hub::Configuration Config;
+ Config.ResourceLimits.MemoryUsageBytes = 1;
+ std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path(), Config);
+ HubProvisionedInstanceInfo Info;
+ const Hub::Response Result = HubInstance->Provision("mem_limit", Info);
+ CHECK(Result.ResponseCode == Hub::EResponseCode::Rejected);
+ CHECK_NE(Result.Message.find("ram usage"), std::string::npos);
+ }
+}
+
TEST_SUITE_END();
void
diff --git a/src/zenserver/hub/hub.h b/src/zenserver/hub/hub.h
index c343b19e2..9895f7068 100644
--- a/src/zenserver/hub/hub.h
+++ b/src/zenserver/hub/hub.h
@@ -6,6 +6,7 @@
#include "resourcemetrics.h"
#include "storageserverinstance.h"
+#include <zencore/filesystem.h>
#include <zencore/system.h>
#include <zenutil/zenserverprocess.h>
@@ -68,6 +69,8 @@ public:
std::string HydrationTargetSpecification;
WatchDogConfiguration WatchDog;
+
+ ResourceMetrics ResourceLimits;
};
typedef std::function<
@@ -86,7 +89,7 @@ public:
struct InstanceInfo
{
HubInstanceState State = HubInstanceState::Unprovisioned;
- std::chrono::system_clock::time_point ProvisionTime;
+ std::chrono::system_clock::time_point StateChangeTime;
ProcessMetrics Metrics;
uint16_t Port = 0;
};
@@ -160,6 +163,8 @@ public:
int GetMaxInstanceCount() const { return m_MaxInstanceCount.load(); }
+ void GetMachineMetrics(SystemMetrics& OutSystemMetrict, DiskSpace& OutDiskSpace) const;
+
const Configuration& GetConfig() const { return m_Config; }
#if ZEN_WITH_TESTS
@@ -181,9 +186,25 @@ private:
#if ZEN_PLATFORM_WINDOWS
JobObject m_JobObject;
#endif
- RwLock m_Lock;
+ mutable RwLock m_Lock;
std::unordered_map<std::string, size_t> m_InstanceLookup;
+ // Mirrors ProcessMetrics with atomic fields, enabling lock-free reads alongside watchdog writes.
+ struct AtomicProcessMetrics
+ {
+ std::atomic<uint64_t> MemoryBytes = 0;
+ std::atomic<uint64_t> KernelTimeMs = 0;
+ std::atomic<uint64_t> UserTimeMs = 0;
+ std::atomic<uint64_t> WorkingSetSize = 0;
+ std::atomic<uint64_t> PeakWorkingSetSize = 0;
+ std::atomic<uint64_t> PagefileUsage = 0;
+ std::atomic<uint64_t> PeakPagefileUsage = 0;
+
+ ProcessMetrics Load() const;
+ void Store(const ProcessMetrics& Metrics);
+ void Reset();
+ };
+
struct ActiveInstance
{
// Invariant: Instance == nullptr if and only if State == Unprovisioned.
@@ -192,11 +213,16 @@ private:
// without holding the hub lock.
std::unique_ptr<StorageServerInstance> Instance;
std::atomic<HubInstanceState> State = HubInstanceState::Unprovisioned;
- // TODO: We should move current metrics here (from StorageServerInstance)
- // Read and updated by WatchDog, updates to State triggers a reset of both
+ // Process metrics - written by WatchDog (inside instance shared lock), read lock-free.
+ AtomicProcessMetrics ProcessMetrics;
+
+ // Activity tracking - written by WatchDog, reset on every state transition.
std::atomic<uint64_t> LastKnownActivitySum = 0;
std::atomic<std::chrono::system_clock::time_point> LastActivityTime = std::chrono::system_clock::time_point::min();
+
+ // Set in UpdateInstanceStateLocked on every state transition; read lock-free by Find/EnumerateModules.
+ std::atomic<std::chrono::system_clock::time_point> StateChangeTime = std::chrono::system_clock::time_point::min();
};
// UpdateInstanceState is overloaded to accept a locked instance pointer (exclusive or shared) or the hub exclusive
@@ -226,21 +252,20 @@ private:
std::vector<ActiveInstance> m_ActiveInstances;
std::deque<size_t> m_FreeActiveInstanceIndexes;
- ResourceMetrics m_ResourceLimits;
- SystemMetrics m_HostMetrics;
+ SystemMetrics m_SystemMetrics;
+ DiskSpace m_DiskSpace;
std::atomic<int> m_MaxInstanceCount = 0;
std::thread m_WatchDog;
Event m_WatchDogEvent;
void WatchDog();
+ void UpdateMachineMetrics();
bool CheckInstanceStatus(HttpClient& ActivityHttpClient,
StorageServerInstance::SharedLockedPtr&& LockedInstance,
size_t ActiveInstanceIndex);
void AttemptRecoverInstance(std::string_view ModuleId);
- void UpdateStats();
- void UpdateCapacityMetrics();
- bool CanProvisionInstance(std::string_view ModuleId, std::string& OutReason);
+ bool CanProvisionInstanceLocked(std::string_view ModuleId, std::string& OutReason);
uint16_t GetInstanceIndexAssignedPort(size_t ActiveInstanceIndex) const;
Response InternalDeprovision(const std::string& ModuleId, std::function<bool(ActiveInstance& Instance)>&& DeprovisionGate);
diff --git a/src/zenserver/hub/storageserverinstance.cpp b/src/zenserver/hub/storageserverinstance.cpp
index 6b139dbf1..802606f6a 100644
--- a/src/zenserver/hub/storageserverinstance.cpp
+++ b/src/zenserver/hub/storageserverinstance.cpp
@@ -57,16 +57,15 @@ StorageServerInstance::SpawnServerProcess()
m_ServerInstance.EnableShutdownOnDestroy();
}
-void
-StorageServerInstance::GetProcessMetrics(ProcessMetrics& OutMetrics) const
+ProcessMetrics
+StorageServerInstance::GetProcessMetrics() const
{
- OutMetrics.MemoryBytes = m_MemoryBytes.load();
- OutMetrics.KernelTimeMs = m_KernelTimeMs.load();
- OutMetrics.UserTimeMs = m_UserTimeMs.load();
- OutMetrics.WorkingSetSize = m_WorkingSetSize.load();
- OutMetrics.PeakWorkingSetSize = m_PeakWorkingSetSize.load();
- OutMetrics.PagefileUsage = m_PagefileUsage.load();
- OutMetrics.PeakPagefileUsage = m_PeakPagefileUsage.load();
+ ProcessMetrics Metrics;
+ if (m_ServerInstance.IsRunning())
+ {
+ zen::GetProcessMetrics(m_ServerInstance.GetProcessHandle(), Metrics);
+ }
+ return Metrics;
}
void
@@ -249,25 +248,6 @@ StorageServerInstance::SharedLockedPtr::IsRunning() const
return m_Instance->m_ServerInstance.IsRunning();
}
-void
-StorageServerInstance::UpdateMetricsLocked()
-{
- if (m_ServerInstance.IsRunning())
- {
- ProcessMetrics Metrics;
- zen::GetProcessMetrics(m_ServerInstance.GetProcessHandle(), Metrics);
-
- m_MemoryBytes.store(Metrics.MemoryBytes);
- m_KernelTimeMs.store(Metrics.KernelTimeMs);
- m_UserTimeMs.store(Metrics.UserTimeMs);
- m_WorkingSetSize.store(Metrics.WorkingSetSize);
- m_PeakWorkingSetSize.store(Metrics.PeakWorkingSetSize);
- m_PagefileUsage.store(Metrics.PagefileUsage);
- m_PeakPagefileUsage.store(Metrics.PeakPagefileUsage);
- }
- // TODO: Resource metrics...
-}
-
#if ZEN_WITH_TESTS
void
StorageServerInstance::SharedLockedPtr::TerminateForTesting() const
diff --git a/src/zenserver/hub/storageserverinstance.h b/src/zenserver/hub/storageserverinstance.h
index 94c47630c..33646c375 100644
--- a/src/zenserver/hub/storageserverinstance.h
+++ b/src/zenserver/hub/storageserverinstance.h
@@ -2,8 +2,6 @@
#pragma once
-#include "resourcemetrics.h"
-
#include <zenutil/zenserverprocess.h>
#include <atomic>
@@ -34,11 +32,9 @@ public:
StorageServerInstance(ZenServerEnvironment& RunEnvironment, const Configuration& Config, std::string_view ModuleId);
~StorageServerInstance();
- const ResourceMetrics& GetResourceMetrics() const { return m_ResourceMetrics; }
-
inline std::string_view GetModuleId() const { return m_ModuleId; }
inline uint16_t GetBasePort() const { return m_Config.BasePort; }
- void GetProcessMetrics(ProcessMetrics& OutMetrics) const;
+ ProcessMetrics GetProcessMetrics() const;
#if ZEN_PLATFORM_WINDOWS
void SetJobObject(JobObject* InJobObject) { m_JobObject = InJobObject; }
@@ -68,15 +64,10 @@ public:
}
bool IsRunning() const;
- const ResourceMetrics& GetResourceMetrics() const
- {
- ZEN_ASSERT(m_Instance);
- return m_Instance->m_ResourceMetrics;
- }
- void UpdateMetrics()
+ ProcessMetrics GetProcessMetrics() const
{
ZEN_ASSERT(m_Instance);
- return m_Instance->UpdateMetricsLocked();
+ return m_Instance->GetProcessMetrics();
}
#if ZEN_WITH_TESTS
@@ -114,12 +105,6 @@ public:
}
bool IsRunning() const;
- const ResourceMetrics& GetResourceMetrics() const
- {
- ZEN_ASSERT(m_Instance);
- return m_Instance->m_ResourceMetrics;
- }
-
void Provision();
void Deprovision();
void Hibernate();
@@ -139,8 +124,6 @@ private:
void HibernateLocked();
void WakeLocked();
- void UpdateMetricsLocked();
-
mutable RwLock m_Lock;
const Configuration m_Config;
std::string m_ModuleId;
@@ -149,15 +132,6 @@ private:
std::filesystem::path m_BaseDir;
std::filesystem::path m_TempDir;
- ResourceMetrics m_ResourceMetrics;
-
- std::atomic<uint64_t> m_MemoryBytes = 0;
- std::atomic<uint64_t> m_KernelTimeMs = 0;
- std::atomic<uint64_t> m_UserTimeMs = 0;
- std::atomic<uint64_t> m_WorkingSetSize = 0;
- std::atomic<uint64_t> m_PeakWorkingSetSize = 0;
- std::atomic<uint64_t> m_PagefileUsage = 0;
- std::atomic<uint64_t> m_PeakPagefileUsage = 0;
#if ZEN_PLATFORM_WINDOWS
JobObject* m_JobObject = nullptr;
diff --git a/src/zenserver/hub/zenhubserver.cpp b/src/zenserver/hub/zenhubserver.cpp
index 314031246..2d0d5398b 100644
--- a/src/zenserver/hub/zenhubserver.cpp
+++ b/src/zenserver/hub/zenhubserver.cpp
@@ -7,12 +7,15 @@
#include "hub.h"
#include <zencore/config.h>
+#include <zencore/except.h>
+#include <zencore/filesystem.h>
#include <zencore/fmtutils.h>
#include <zencore/memory/llm.h>
#include <zencore/memory/memorytrace.h>
#include <zencore/memory/tagtrace.h>
#include <zencore/scopeguard.h>
#include <zencore/sentryintegration.h>
+#include <zencore/system.h>
#include <zencore/windows.h>
#include <zenhttp/httpapiservice.h>
#include <zenutil/service.h>
@@ -203,6 +206,34 @@ ZenHubServerConfigurator::AddCliOptions(cxxopts::Options& Options)
"Request timeout in milliseconds for instance activity check requests",
cxxopts::value<uint32_t>(m_ServerOptions.WatchdogConfig.ActivityCheckRequestTimeoutMs)->default_value("200"),
"<ms>");
+
+ Options.add_option("hub",
+ "",
+ "hub-provision-disk-limit-bytes",
+ "Reject provisioning when used disk bytes exceed this value (0 = no limit).",
+ cxxopts::value<uint64_t>(m_ServerOptions.HubProvisionDiskLimitBytes),
+ "<bytes>");
+
+ Options.add_option("hub",
+ "",
+ "hub-provision-disk-limit-percent",
+ "Reject provisioning when used disk exceeds this percentage of total disk (0 = no limit).",
+ cxxopts::value<uint32_t>(m_ServerOptions.HubProvisionDiskLimitPercent),
+ "<percent>");
+
+ Options.add_option("hub",
+ "",
+ "hub-provision-memory-limit-bytes",
+ "Reject provisioning when used memory bytes exceed this value (0 = no limit).",
+ cxxopts::value<uint64_t>(m_ServerOptions.HubProvisionMemoryLimitBytes),
+ "<bytes>");
+
+ Options.add_option("hub",
+ "",
+ "hub-provision-memory-limit-percent",
+ "Reject provisioning when used memory exceeds this percentage of total RAM (0 = no limit).",
+ cxxopts::value<uint32_t>(m_ServerOptions.HubProvisionMemoryLimitPercent),
+ "<percent>");
}
void
@@ -226,6 +257,18 @@ ZenHubServerConfigurator::OnConfigFileParsed(LuaConfig::Options& LuaOptions)
void
ZenHubServerConfigurator::ValidateOptions()
{
+ if (m_ServerOptions.HubProvisionDiskLimitPercent > 100)
+ {
+ throw OptionParseException(
+ fmt::format("'--hub-provision-disk-limit-percent' ({}) must be in range 0..100", m_ServerOptions.HubProvisionDiskLimitPercent),
+ {});
+ }
+ if (m_ServerOptions.HubProvisionMemoryLimitPercent > 100)
+ {
+ throw OptionParseException(fmt::format("'--hub-provision-memory-limit-percent' ({}) must be in range 0..100",
+ m_ServerOptions.HubProvisionMemoryLimitPercent),
+ {});
+ }
}
///////////////////////////////////////////////////////////////////////////
@@ -373,30 +416,71 @@ ZenHubServer::InitializeState(const ZenHubServerConfig& ServerConfig)
ZEN_UNUSED(ServerConfig);
}
+ResourceMetrics
+ZenHubServer::ResolveLimits(const ZenHubServerConfig& ServerConfig)
+{
+ uint64_t DiskTotal = 0;
+ uint64_t MemoryTotal = 0;
+
+ if (ServerConfig.HubProvisionDiskLimitPercent > 0)
+ {
+ DiskSpace Disk;
+ if (DiskSpaceInfo(ServerConfig.DataDir, Disk))
+ {
+ DiskTotal = Disk.Total;
+ }
+ else
+ {
+ ZEN_WARN("Failed to query disk space for '{}'; disk percent limit will not be applied", ServerConfig.DataDir);
+ }
+ }
+ if (ServerConfig.HubProvisionMemoryLimitPercent > 0)
+ {
+ MemoryTotal = GetSystemMetrics().SystemMemoryMiB * 1024 * 1024;
+ }
+
+ auto Resolve = [](uint64_t Bytes, uint32_t Pct, uint64_t Total) -> uint64_t {
+ const uint64_t PctBytes = Pct > 0 ? (Total * Pct) / 100 : 0;
+ if (Bytes > 0 && PctBytes > 0)
+ {
+ return Min(Bytes, PctBytes);
+ }
+ return Bytes > 0 ? Bytes : PctBytes;
+ };
+
+ return {
+ .DiskUsageBytes = Resolve(ServerConfig.HubProvisionDiskLimitBytes, ServerConfig.HubProvisionDiskLimitPercent, DiskTotal),
+ .MemoryUsageBytes = Resolve(ServerConfig.HubProvisionMemoryLimitBytes, ServerConfig.HubProvisionMemoryLimitPercent, MemoryTotal),
+ };
+}
+
void
ZenHubServer::InitializeServices(const ZenHubServerConfig& ServerConfig)
{
ZEN_INFO("instantiating Hub");
+ Hub::Configuration HubConfig{
+ .UseJobObject = ServerConfig.HubUseJobObject,
+ .BasePortNumber = ServerConfig.HubBasePortNumber,
+ .InstanceLimit = ServerConfig.HubInstanceLimit,
+ .InstanceHttpThreadCount = ServerConfig.HubInstanceHttpThreadCount,
+ .InstanceCoreLimit = ServerConfig.HubInstanceCoreLimit,
+ .InstanceConfigPath = ServerConfig.HubInstanceConfigPath,
+ .HydrationTargetSpecification = ServerConfig.HydrationTargetSpecification,
+ .WatchDog =
+ {
+ .CycleInterval = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleIntervalMs),
+ .CycleProcessingBudget = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleProcessingBudgetMs),
+ .InstanceCheckThrottle = std::chrono::milliseconds(ServerConfig.WatchdogConfig.InstanceCheckThrottleMs),
+ .ProvisionedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.ProvisionedInactivityTimeoutSeconds),
+ .HibernatedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.HibernatedInactivityTimeoutSeconds),
+ .InactivityCheckMargin = std::chrono::seconds(ServerConfig.WatchdogConfig.InactivityCheckMarginSeconds),
+ .ActivityCheckConnectTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckConnectTimeoutMs),
+ .ActivityCheckRequestTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckRequestTimeoutMs),
+ },
+ .ResourceLimits = ResolveLimits(ServerConfig)};
+
m_Hub = std::make_unique<Hub>(
- Hub::Configuration{
- .UseJobObject = ServerConfig.HubUseJobObject,
- .BasePortNumber = ServerConfig.HubBasePortNumber,
- .InstanceLimit = ServerConfig.HubInstanceLimit,
- .InstanceHttpThreadCount = ServerConfig.HubInstanceHttpThreadCount,
- .InstanceCoreLimit = ServerConfig.HubInstanceCoreLimit,
- .InstanceConfigPath = ServerConfig.HubInstanceConfigPath,
- .HydrationTargetSpecification = ServerConfig.HydrationTargetSpecification,
- .WatchDog =
- {
- .CycleInterval = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleIntervalMs),
- .CycleProcessingBudget = std::chrono::milliseconds(ServerConfig.WatchdogConfig.CycleProcessingBudgetMs),
- .InstanceCheckThrottle = std::chrono::milliseconds(ServerConfig.WatchdogConfig.InstanceCheckThrottleMs),
- .ProvisionedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.ProvisionedInactivityTimeoutSeconds),
- .HibernatedInactivityTimeout = std::chrono::seconds(ServerConfig.WatchdogConfig.HibernatedInactivityTimeoutSeconds),
- .InactivityCheckMargin = std::chrono::seconds(ServerConfig.WatchdogConfig.InactivityCheckMarginSeconds),
- .ActivityCheckConnectTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckConnectTimeoutMs),
- .ActivityCheckRequestTimeout = std::chrono::milliseconds(ServerConfig.WatchdogConfig.ActivityCheckRequestTimeoutMs),
- }},
+ std::move(HubConfig),
ZenServerEnvironment(ZenServerEnvironment::Hub,
ServerConfig.DataDir / "hub",
ServerConfig.DataDir / "servers",
diff --git a/src/zenserver/hub/zenhubserver.h b/src/zenserver/hub/zenhubserver.h
index 77df3eaa3..9660e9a49 100644
--- a/src/zenserver/hub/zenhubserver.h
+++ b/src/zenserver/hub/zenhubserver.h
@@ -3,6 +3,7 @@
#pragma once
#include "hubinstancestate.h"
+#include "resourcemetrics.h"
#include "zenserver.h"
#include <zenutil/consul.h>
@@ -49,6 +50,10 @@ struct ZenHubServerConfig : public ZenServerConfig
std::filesystem::path HubInstanceConfigPath; // Path to Lua config file
std::string HydrationTargetSpecification; // hydration/dehydration target specification
ZenHubWatchdogConfig WatchdogConfig;
+ uint64_t HubProvisionDiskLimitBytes = 0;
+ uint32_t HubProvisionDiskLimitPercent = 0;
+ uint64_t HubProvisionMemoryLimitBytes = 0;
+ uint32_t HubProvisionMemoryLimitPercent = 0;
};
class Hub;
@@ -126,6 +131,8 @@ private:
uint32_t m_ConsulHealthIntervalSeconds = 10;
uint32_t m_ConsulDeregisterAfterSeconds = 30;
+ static ResourceMetrics ResolveLimits(const ZenHubServerConfig& ServerConfig);
+
void InitializeState(const ZenHubServerConfig& ServerConfig);
void InitializeServices(const ZenHubServerConfig& ServerConfig);
void RegisterServices(const ZenHubServerConfig& ServerConfig);
diff --git a/src/zenutil/include/zenutil/zenserverprocess.h b/src/zenutil/include/zenutil/zenserverprocess.h
index 03d507400..d6f66fbea 100644
--- a/src/zenutil/include/zenutil/zenserverprocess.h
+++ b/src/zenutil/include/zenutil/zenserverprocess.h
@@ -66,6 +66,7 @@ public:
std::filesystem::path CreateNewTestDir();
std::filesystem::path CreateChildDir(std::string_view ChildName);
std::filesystem::path ProgramBaseDir() const { return m_ProgramBaseDir; }
+ std::filesystem::path GetChildBaseDir() const { return m_ChildProcessBaseDir; }
std::filesystem::path GetTestRootDir(std::string_view Path);
inline bool IsInitialized() const { return m_IsInitialized; }
inline bool IsTestEnvironment() const { return m_IsTestInstance; }