aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDan Engelbrecht <[email protected]>2026-03-23 14:09:46 +0100
committerGitHub Enterprise <[email protected]>2026-03-23 14:09:46 +0100
commitd336e5937019dbab1924419624faa6ffc776cd7f (patch)
treecee312dc86f8404bbd4f987cbcf2f4d96124fc19 /src
parentUnique session/client tracking using HyperLogLog (#884) (diff)
downloadzen-d336e5937019dbab1924419624faa6ffc776cd7f.tar.xz
zen-d336e5937019dbab1924419624faa6ffc776cd7f.zip
add hub instance crash recovery (#885)
* add hub instance crash recovery
Diffstat (limited to 'src')
-rw-r--r--src/zenserver/frontend/html/pages/hub.js2
-rw-r--r--src/zenserver/frontend/html/zen.css7
-rw-r--r--src/zenserver/hub/hub.cpp268
-rw-r--r--src/zenserver/hub/hub.h6
-rw-r--r--src/zenserver/hub/hubinstancestate.cpp4
-rw-r--r--src/zenserver/hub/hubinstancestate.h2
-rw-r--r--src/zenserver/hub/storageserverinstance.cpp57
-rw-r--r--src/zenserver/hub/storageserverinstance.h6
-rw-r--r--src/zenutil/include/zenutil/zenserverprocess.h1
-rw-r--r--src/zenutil/zenserverprocess.cpp10
10 files changed, 328 insertions, 35 deletions
diff --git a/src/zenserver/frontend/html/pages/hub.js b/src/zenserver/frontend/html/pages/hub.js
index 3a5b67483..149a5c79c 100644
--- a/src/zenserver/frontend/html/pages/hub.js
+++ b/src/zenserver/frontend/html/pages/hub.js
@@ -8,7 +8,7 @@ import { Friendly } from "../util/friendly.js"
import { Modal } from "../util/modal.js"
////////////////////////////////////////////////////////////////////////////////
-const STABLE_STATES = new Set(["provisioned", "hibernated"]);
+const STABLE_STATES = new Set(["provisioned", "hibernated", "crashed"]);
function _is_actionable(state)
{
diff --git a/src/zenserver/frontend/html/zen.css b/src/zenserver/frontend/html/zen.css
index 5ce60d2d2..49a1e6d21 100644
--- a/src/zenserver/frontend/html/zen.css
+++ b/src/zenserver/frontend/html/zen.css
@@ -1099,6 +1099,12 @@ tr:last-child td {
.module-state-dot[data-state="provisioned"] { background: var(--theme_ok); }
.module-state-dot[data-state="hibernated"] { background: var(--theme_warn); }
.module-state-dot[data-state="unprovisioned"] { background: var(--theme_g1); }
+.module-state-dot[data-state="crashed"] { background: var(--theme_fail); }
+
+@keyframes module-dot-recovering {
+ 0%, 59.9% { background: var(--theme_fail); }
+ 60%, 100% { background: var(--theme_ok); }
+}
@keyframes module-dot-hibernating {
0%, 59.9% { background: var(--theme_warn); }
@@ -1124,6 +1130,7 @@ tr:last-child td {
.module-state-dot[data-state="hibernating"] { animation: module-dot-hibernating 1s steps(1, end) infinite; }
.module-state-dot[data-state="waking"] { animation: module-dot-waking 1s steps(1, end) infinite; }
.module-state-dot[data-state="provisioning"] { animation: module-dot-provisioning 1s steps(1, end) infinite; }
+.module-state-dot[data-state="recovering"] { animation: module-dot-recovering 1s steps(1, end) infinite; }
.module-state-dot[data-state="deprovisioning"][data-prev-state="provisioned"] {
animation: module-dot-deprovisioning-from-provisioned 1s steps(1, end) infinite;
}
diff --git a/src/zenserver/hub/hub.cpp b/src/zenserver/hub/hub.cpp
index ebbb9432a..e8487d7d9 100644
--- a/src/zenserver/hub/hub.cpp
+++ b/src/zenserver/hub/hub.cpp
@@ -307,6 +307,13 @@ Hub::Provision(std::string_view ModuleId, HubProvisionedInstanceInfo& OutInfo, s
const size_t ActiveInstanceIndex = It->second;
ZEN_ASSERT(m_ActiveInstances.size() > ActiveInstanceIndex);
+ if (m_RecoveringModules.contains(std::string(ModuleId)))
+ {
+ OutReason = fmt::format("Module '{}' is currently recovering from a crash", ModuleId);
+ ZEN_WARN("Attempted to provision module '{}' which is currently recovering", ModuleId);
+ return false;
+ }
+
std::unique_ptr<StorageServerInstance>& InstanceRaw = m_ActiveInstances[ActiveInstanceIndex];
Instance = InstanceRaw->LockExclusive(/*Wait*/ true);
AllocatedPort = InstanceRaw->GetBasePort();
@@ -409,6 +416,13 @@ Hub::Deprovision(const std::string& ModuleId, std::string& OutReason)
return false;
}
+ if (m_RecoveringModules.contains(ModuleId))
+ {
+ OutReason = fmt::format("Module '{}' is currently recovering from a crash", ModuleId);
+ ZEN_WARN("Attempted to deprovision module '{}' which is currently recovering", ModuleId);
+ return false;
+ }
+
if (auto It = m_InstanceLookup.find(ModuleId); It == m_InstanceLookup.end())
{
ZEN_WARN("Attempted to deprovision non-existent module '{}'", ModuleId);
@@ -479,7 +493,7 @@ Hub::Hibernate(const std::string& ModuleId, std::string& OutReason)
RwLock::ExclusiveLockScope _(m_Lock);
if (m_ProvisioningModules.contains(ModuleId) || m_DeprovisioningModules.contains(ModuleId) ||
- m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId))
+ m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId) || m_RecoveringModules.contains(ModuleId))
{
OutReason = fmt::format("Module '{}' is currently changing state", ModuleId);
return false;
@@ -538,7 +552,7 @@ Hub::Wake(const std::string& ModuleId, std::string& OutReason)
RwLock::ExclusiveLockScope _(m_Lock);
if (m_ProvisioningModules.contains(ModuleId) || m_DeprovisioningModules.contains(ModuleId) ||
- m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId))
+ m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId) || m_RecoveringModules.contains(ModuleId))
{
OutReason = fmt::format("Module '{}' is currently changing state", ModuleId);
return false;
@@ -707,60 +721,159 @@ Hub::CanProvisionInstance(std::string_view ModuleId, std::string& OutReason)
}
void
+Hub::AttemptRecoverInstance(std::string_view ModuleId)
+{
+ StorageServerInstance::ExclusiveLockedPtr Instance;
+ StorageServerInstance* RawInstance = nullptr;
+
+ {
+ RwLock::ExclusiveLockScope _(m_Lock);
+
+ if (m_RecoveringModules.contains(std::string(ModuleId)) || m_ProvisioningModules.contains(std::string(ModuleId)) ||
+ m_DeprovisioningModules.contains(std::string(ModuleId)) || m_HibernatingModules.contains(std::string(ModuleId)) ||
+ m_WakingModules.contains(std::string(ModuleId)))
+ {
+ return;
+ }
+
+ auto It = m_InstanceLookup.find(std::string(ModuleId));
+ if (It == m_InstanceLookup.end())
+ {
+ return;
+ }
+
+ const size_t ActiveInstanceIndex = It->second;
+ ZEN_ASSERT(ActiveInstanceIndex < m_ActiveInstances.size());
+ RawInstance = m_ActiveInstances[ActiveInstanceIndex].get();
+ Instance = RawInstance->LockExclusive(/*Wait*/ true);
+ m_RecoveringModules.emplace(std::string(ModuleId));
+ }
+
+ ZEN_ASSERT(Instance);
+
+ auto RemoveRecoveringModule = MakeGuard([&] {
+ RwLock::ExclusiveLockScope _(m_Lock);
+ m_RecoveringModules.erase(std::string(ModuleId));
+ });
+
+ // Re-validate: state may have changed between releasing shared lock and acquiring exclusive lock
+ if (Instance.GetState() != HubInstanceState::Provisioned || Instance.IsRunning())
+ {
+ return;
+ }
+
+ const uint16_t Port = RawInstance->GetBasePort();
+ std::string BaseUri; // TODO?
+
+ if (Instance.RecoverFromCrash())
+ {
+ Instance = {};
+ return;
+ }
+
+ // Restart threw but data dir is intact - run Dehydrate via Deprovision before cleanup.
+ Instance.Deprovision();
+ Instance = {};
+
+ if (m_DeprovisionedModuleCallback)
+ {
+ try
+ {
+ m_DeprovisionedModuleCallback(ModuleId, HubProvisionedInstanceInfo{.BaseUri = BaseUri, .Port = Port});
+ }
+ catch (const std::exception& Ex)
+ {
+ ZEN_ERROR("Deprovision callback for recovered module {} failed. Reason: '{}'", ModuleId, Ex.what());
+ }
+ }
+
+ std::unique_ptr<StorageServerInstance> DestroyInstance;
+ {
+ RwLock::ExclusiveLockScope _(m_Lock);
+ if (auto It = m_InstanceLookup.find(std::string(ModuleId)); It != m_InstanceLookup.end())
+ {
+ const size_t ActiveInstanceIndex = It->second;
+ ZEN_ASSERT(ActiveInstanceIndex < m_ActiveInstances.size());
+ DestroyInstance = std::move(m_ActiveInstances[ActiveInstanceIndex]);
+ m_FreeActiveInstanceIndexes.push_back(ActiveInstanceIndex);
+ m_InstanceLookup.erase(It);
+ }
+ m_FreePorts.push_back(Port);
+ m_RecoveringModules.erase(std::string(ModuleId));
+ }
+ RemoveRecoveringModule.Dismiss();
+
+ try
+ {
+ DestroyInstance.reset();
+ }
+ catch (const std::exception& Ex)
+ {
+ ZEN_ERROR("Failed to destroy recovered instance for module '{}': {}", ModuleId, Ex.what());
+ }
+}
+
+void
Hub::WatchDog()
{
constexpr uint64_t WatchDogWakeupTimeMs = 5000;
constexpr uint64_t WatchDogProcessingTimeMs = 500;
- size_t CheckInstanceIndex = 0;
+ size_t CheckInstanceIndex = SIZE_MAX; // first increment wraps to 0
while (!m_WatchDogEvent.Wait(WatchDogWakeupTimeMs))
{
try
{
- size_t MaxCheckCount = m_Lock.WithSharedLock([this]() { return m_InstanceLookup.size(); });
+ // Snapshot slot count. We iterate all slots (including freed nulls) so
+ // round-robin coverage is not skewed by deprovisioned entries.
+ size_t SlotsRemaining = m_Lock.WithSharedLock([this]() { return m_ActiveInstances.size(); });
Stopwatch Timer;
- while (MaxCheckCount-- > 0 && Timer.GetElapsedTimeMs() < WatchDogProcessingTimeMs && !m_WatchDogEvent.Wait(5))
+ bool ShuttingDown = false;
+ while (SlotsRemaining > 0 && Timer.GetElapsedTimeMs() < WatchDogProcessingTimeMs && !ShuttingDown)
{
StorageServerInstance::SharedLockedPtr LockedInstance;
- m_Lock.WithSharedLock([this, &CheckInstanceIndex, &LockedInstance]() {
- if (m_InstanceLookup.empty())
- {
- return;
- }
-
- size_t MaxLoopCount = m_ActiveInstances.size();
- StorageServerInstance* Instance = nullptr;
- while (MaxLoopCount-- > 0 && !Instance)
+ m_Lock.WithSharedLock([this, &CheckInstanceIndex, &LockedInstance, &SlotsRemaining]() {
+ // Advance through null (freed) slots under a single lock acquisition.
+ while (SlotsRemaining > 0)
{
+ SlotsRemaining--;
CheckInstanceIndex++;
if (CheckInstanceIndex >= m_ActiveInstances.size())
{
CheckInstanceIndex = 0;
}
- Instance = (CheckInstanceIndex < m_ActiveInstances.size()) ? m_ActiveInstances[CheckInstanceIndex].get() : nullptr;
- }
-
- if (Instance)
- {
- LockedInstance = Instance->LockShared(/*Wait*/ false);
+ StorageServerInstance* Instance = m_ActiveInstances[CheckInstanceIndex].get();
+ if (Instance)
+ {
+ LockedInstance = Instance->LockShared(/*Wait*/ false);
+ break; // Found a live slot (locked or busy); stop scanning this batch.
+ }
}
});
- if (LockedInstance)
+ if (!LockedInstance)
{
- if (LockedInstance.IsRunning())
- {
- LockedInstance.UpdateMetrics();
- }
- else if (LockedInstance.GetState() == HubInstanceState::Provisioned)
- {
- // Process is not running but state says it should be - instance died unexpectedly.
- // TODO: Track and attempt recovery.
- }
- // else: transitional state (Provisioning, Deprovisioning, Hibernating, Waking) - expected, skip.
+ // Either all remaining slots were null, or the live slot's lock was busy -- move on.
+ continue;
+ }
+
+ if (LockedInstance.IsRunning())
+ {
+ LockedInstance.UpdateMetrics();
+ }
+ else if (LockedInstance.GetState() == HubInstanceState::Provisioned)
+ {
+ // Process is not running but state says it should be - instance died unexpectedly.
+ const std::string ModuleId(LockedInstance.GetModuleId());
LockedInstance = {};
+ AttemptRecoverInstance(ModuleId);
}
+ // else: transitional state (Provisioning, Deprovisioning, Hibernating, Waking, Recovering, Crashed) - expected, skip.
+ LockedInstance = {};
+
+ // Rate-limit: pause briefly between live-instance checks and respond to shutdown.
+ ShuttingDown = m_WatchDogEvent.Wait(5);
}
}
catch (const std::exception& Ex)
@@ -1293,9 +1406,102 @@ TEST_CASE("hub.hibernate_wake_errors")
CHECK(Reason.empty());
}
+TEST_CASE("hub.recover_process_crash")
+{
+ ScopedTemporaryDirectory TempDir;
+ std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path());
+
+ HubProvisionedInstanceInfo Info;
+ std::string Reason;
+ REQUIRE_MESSAGE(HubInstance->Provision("module_a", Info, Reason), Reason);
+
+ // Kill the child process to simulate a crash, then poll until the watchdog detects it,
+ // recovers the instance, and the new process is serving requests.
+ HubInstance->TerminateModuleForTesting("module_a");
+
+ constexpr auto kPollIntervalMs = std::chrono::milliseconds(200);
+ constexpr auto kTimeoutMs = std::chrono::seconds(20);
+ const auto Deadline = std::chrono::steady_clock::now() + kTimeoutMs;
+
+ // A successful HTTP health check on the same port confirms the new process is up.
+ HttpClient ModClient(fmt::format("http://localhost:{}", Info.Port), kFastTimeout);
+ bool Recovered = false;
+ while (std::chrono::steady_clock::now() < Deadline)
+ {
+ std::this_thread::sleep_for(kPollIntervalMs);
+ Hub::InstanceInfo InstanceInfo;
+ if (HubInstance->Find("module_a", &InstanceInfo) && InstanceInfo.State == HubInstanceState::Provisioned &&
+ ModClient.Get("/health/"))
+ {
+ // Recovery must reuse the same port - the instance was never removed from the hub's
+ // port table during recovery, so AttemptRecoverInstance reuses m_Config.BasePort.
+ CHECK_EQ(InstanceInfo.Port, Info.Port);
+ Recovered = true;
+ break;
+ }
+ }
+ CHECK_MESSAGE(Recovered, "Instance did not recover within timeout");
+}
+
+TEST_CASE("hub.recover_process_crash_then_deprovision")
+{
+ ScopedTemporaryDirectory TempDir;
+ std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path());
+
+ HubProvisionedInstanceInfo Info;
+ std::string Reason;
+ REQUIRE_MESSAGE(HubInstance->Provision("module_a", Info, Reason), Reason);
+
+ // Kill the child process, wait for the watchdog to detect and recover the instance.
+ HubInstance->TerminateModuleForTesting("module_a");
+
+ constexpr auto kPollIntervalMs = std::chrono::milliseconds(200);
+ constexpr auto kTimeoutMs = std::chrono::seconds(20);
+ const auto Deadline = std::chrono::steady_clock::now() + kTimeoutMs;
+
+ bool Recovered = false;
+ while (std::chrono::steady_clock::now() < Deadline)
+ {
+ std::this_thread::sleep_for(kPollIntervalMs);
+ Hub::InstanceInfo InstanceInfo;
+ if (HubInstance->Find("module_a", &InstanceInfo) && InstanceInfo.State == HubInstanceState::Provisioned)
+ {
+ Recovered = true;
+ break;
+ }
+ }
+ REQUIRE_MESSAGE(Recovered, "Instance did not recover within timeout");
+
+ // After recovery, deprovision should succeed and a re-provision should work.
+ CHECK_MESSAGE(HubInstance->Deprovision("module_a", Reason), Reason);
+ CHECK_EQ(HubInstance->GetInstanceCount(), 0);
+
+ HubProvisionedInstanceInfo NewInfo;
+ CHECK_MESSAGE(HubInstance->Provision("module_a", NewInfo, Reason), Reason);
+ CHECK_NE(NewInfo.Port, 0);
+ HttpClient NewClient(fmt::format("http://localhost:{}", NewInfo.Port), kFastTimeout);
+ CHECK_MESSAGE(NewClient.Get("/health/"), "Re-provisioned instance is not serving requests");
+}
+
TEST_SUITE_END();
void
+Hub::TerminateModuleForTesting(const std::string& ModuleId)
+{
+ RwLock::SharedLockScope _(m_Lock);
+ auto It = m_InstanceLookup.find(ModuleId);
+ if (It == m_InstanceLookup.end())
+ {
+ return;
+ }
+ StorageServerInstance::SharedLockedPtr Locked = m_ActiveInstances[It->second]->LockShared(/*Wait*/ true);
+ if (Locked)
+ {
+ Locked.TerminateForTesting();
+ }
+}
+
+void
hub_forcelink()
{
}
diff --git a/src/zenserver/hub/hub.h b/src/zenserver/hub/hub.h
index 8c4039c38..f880dab52 100644
--- a/src/zenserver/hub/hub.h
+++ b/src/zenserver/hub/hub.h
@@ -131,6 +131,10 @@ public:
const Configuration& GetConfig() const { return m_Config; }
+#if ZEN_WITH_TESTS
+ void TerminateModuleForTesting(const std::string& ModuleId);
+#endif
+
private:
const Configuration m_Config;
ZenServerEnvironment m_RunEnvironment;
@@ -150,6 +154,7 @@ private:
std::unordered_set<std::string> m_ProvisioningModules;
std::unordered_set<std::string> m_HibernatingModules;
std::unordered_set<std::string> m_WakingModules;
+ std::unordered_set<std::string> m_RecoveringModules;
std::vector<std::unique_ptr<StorageServerInstance>> m_ActiveInstances;
std::vector<size_t> m_FreeActiveInstanceIndexes;
ResourceMetrics m_ResourceLimits;
@@ -160,6 +165,7 @@ private:
Event m_WatchDogEvent;
void WatchDog();
+ void AttemptRecoverInstance(std::string_view ModuleId);
void UpdateStats();
void UpdateCapacityMetrics();
diff --git a/src/zenserver/hub/hubinstancestate.cpp b/src/zenserver/hub/hubinstancestate.cpp
index 8337f73a8..c47fdd294 100644
--- a/src/zenserver/hub/hubinstancestate.cpp
+++ b/src/zenserver/hub/hubinstancestate.cpp
@@ -25,6 +25,10 @@ ToString(HubInstanceState State)
return "waking";
case HubInstanceState::Deprovisioning:
return "deprovisioning";
+ case HubInstanceState::Crashed:
+ return "crashed";
+ case HubInstanceState::Recovering:
+ return "recovering";
}
ZEN_ASSERT(false);
return "unknown";
diff --git a/src/zenserver/hub/hubinstancestate.h b/src/zenserver/hub/hubinstancestate.h
index ce1f222bd..2dee89ff0 100644
--- a/src/zenserver/hub/hubinstancestate.h
+++ b/src/zenserver/hub/hubinstancestate.h
@@ -16,6 +16,8 @@ enum class HubInstanceState : uint32_t
Hibernated, // Process stopped, data preserved; can be woken
Waking, // Starting process from preserved data
Deprovisioning, // Shutting down process and cleaning up data
+ Crashed, // Process died unexpectedly while Provisioned; recovery pending
+ Recovering, // Attempting in-place restart after a crash
};
std::string_view ToString(HubInstanceState State);
diff --git a/src/zenserver/hub/storageserverinstance.cpp b/src/zenserver/hub/storageserverinstance.cpp
index 7933cfa70..0a9efcc44 100644
--- a/src/zenserver/hub/storageserverinstance.cpp
+++ b/src/zenserver/hub/storageserverinstance.cpp
@@ -28,6 +28,7 @@ void
StorageServerInstance::SpawnServerProcess()
{
ZEN_ASSERT_FORMAT(!m_ServerInstance.IsRunning(), "Storage server instance for module '{}' is already running", m_ModuleId);
+ m_ServerInstance.ResetDeadProcess();
m_ServerInstance.SetServerExecutablePath(GetRunningExecutablePath());
m_ServerInstance.SetDataDir(m_BaseDir);
@@ -77,6 +78,12 @@ StorageServerInstance::ProvisionLocked()
return;
}
+ if (m_State.load() == HubInstanceState::Crashed)
+ {
+ ZEN_WARN("Storage server instance for module '{}' is in crashed state; re-provisioning from scratch", m_ModuleId);
+ m_State = HubInstanceState::Unprovisioned;
+ }
+
if (m_State.load() == HubInstanceState::Hibernated)
{
if (WakeLocked())
@@ -105,18 +112,23 @@ StorageServerInstance::ProvisionLocked()
void
StorageServerInstance::DeprovisionLocked()
{
- if (m_State.load() != HubInstanceState::Provisioned)
+ const HubInstanceState CurrentState = m_State.load();
+ if (CurrentState != HubInstanceState::Provisioned && CurrentState != HubInstanceState::Crashed)
{
ZEN_WARN("Attempted to deprovision storage server instance for module '{}' which is not provisioned (state: '{}')",
m_ModuleId,
- ToString(m_State.load()));
+ ToString(CurrentState));
return;
}
ZEN_INFO("Deprovisioning storage server instance for module '{}'", m_ModuleId);
m_State = HubInstanceState::Deprovisioning;
- m_ServerInstance.Shutdown();
+ if (CurrentState == HubInstanceState::Provisioned)
+ {
+ m_ServerInstance.Shutdown(); // Graceful; process may still be running
+ }
+ // Crashed: process already dead; skip Shutdown
Dehydrate();
@@ -187,6 +199,29 @@ StorageServerInstance::WakeLocked()
}
}
+bool
+StorageServerInstance::RecoverCrashedLocked()
+{
+ ZEN_ASSERT(m_State.load() == HubInstanceState::Provisioned);
+ ZEN_ASSERT(!m_ServerInstance.IsRunning());
+
+ ZEN_WARN("Storage server instance for module '{}' has crashed; attempting in-place recovery", m_ModuleId);
+ m_State = HubInstanceState::Recovering;
+ try
+ {
+ SpawnServerProcess();
+ m_State = HubInstanceState::Provisioned;
+ ZEN_INFO("Storage server instance for module '{}' recovered successfully", m_ModuleId);
+ return true;
+ }
+ catch (const std::exception& Ex)
+ {
+ ZEN_ERROR("Failed to restart module '{}': {}", m_ModuleId, Ex.what());
+ m_State = HubInstanceState::Crashed;
+ return false;
+ }
+}
+
void
StorageServerInstance::Hydrate()
{
@@ -303,6 +338,15 @@ StorageServerInstance::UpdateMetricsLocked()
// TODO: Resource metrics...
}
+#if ZEN_WITH_TESTS
+void
+StorageServerInstance::SharedLockedPtr::TerminateForTesting() const
+{
+ ZEN_ASSERT(m_Instance != nullptr);
+ m_Instance->m_ServerInstance.Terminate();
+}
+#endif
+
StorageServerInstance::ExclusiveLockedPtr::ExclusiveLockedPtr() : m_Lock(nullptr), m_Instance(nullptr)
{
}
@@ -402,4 +446,11 @@ StorageServerInstance::ExclusiveLockedPtr::Wake()
return m_Instance->WakeLocked();
}
+bool
+StorageServerInstance::ExclusiveLockedPtr::RecoverFromCrash()
+{
+ ZEN_ASSERT(m_Instance != nullptr);
+ return m_Instance->RecoverCrashedLocked();
+}
+
} // namespace zen
diff --git a/src/zenserver/hub/storageserverinstance.h b/src/zenserver/hub/storageserverinstance.h
index ba15133bf..9963f640e 100644
--- a/src/zenserver/hub/storageserverinstance.h
+++ b/src/zenserver/hub/storageserverinstance.h
@@ -81,6 +81,10 @@ public:
return m_Instance->UpdateMetricsLocked();
}
+#if ZEN_WITH_TESTS
+ void TerminateForTesting() const; // kills the child process to simulate a crash
+#endif
+
private:
RwLock* m_Lock = nullptr;
StorageServerInstance* m_Instance = nullptr;
@@ -122,6 +126,7 @@ public:
void Deprovision();
[[nodiscard]] bool Hibernate();
[[nodiscard]] bool Wake();
+ [[nodiscard]] bool RecoverFromCrash(); // true = recovered; false = spawn failed (Crashed), caller must Deprovision() + cleanup
private:
RwLock* m_Lock = nullptr;
@@ -136,6 +141,7 @@ private:
[[nodiscard]] bool HibernateLocked();
[[nodiscard]] bool WakeLocked();
+ [[nodiscard]] bool RecoverCrashedLocked(); // true = recovered (Provisioned); false = spawn failed (Crashed)
void UpdateMetricsLocked();
diff --git a/src/zenutil/include/zenutil/zenserverprocess.h b/src/zenutil/include/zenutil/zenserverprocess.h
index daa07a1e1..03d507400 100644
--- a/src/zenutil/include/zenutil/zenserverprocess.h
+++ b/src/zenutil/include/zenutil/zenserverprocess.h
@@ -135,6 +135,7 @@ struct ZenServerInstance
#endif
bool IsRunning() const;
bool Terminate();
+ void ResetDeadProcess();
std::string GetLogOutput() const;
inline ServerMode GetServerMode() const { return m_ServerMode; }
diff --git a/src/zenutil/zenserverprocess.cpp b/src/zenutil/zenserverprocess.cpp
index 3993d6a32..2b27b2d8b 100644
--- a/src/zenutil/zenserverprocess.cpp
+++ b/src/zenutil/zenserverprocess.cpp
@@ -1445,6 +1445,16 @@ ZenServerInstance::IsRunning() const
return m_Process.IsRunning();
}
+void
+ZenServerInstance::ResetDeadProcess()
+{
+ if (m_Process.IsValid() && !m_Process.IsRunning())
+ {
+ m_Process.Reset();
+ m_ShutdownEvent.reset();
+ }
+}
+
std::string
ZenServerInstance::GetLogOutput() const
{