diff options
| author | Dan Engelbrecht <[email protected]> | 2026-03-23 14:09:46 +0100 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2026-03-23 14:09:46 +0100 |
| commit | d336e5937019dbab1924419624faa6ffc776cd7f (patch) | |
| tree | cee312dc86f8404bbd4f987cbcf2f4d96124fc19 /src | |
| parent | Unique session/client tracking using HyperLogLog (#884) (diff) | |
| download | zen-d336e5937019dbab1924419624faa6ffc776cd7f.tar.xz zen-d336e5937019dbab1924419624faa6ffc776cd7f.zip | |
add hub instance crash recovery (#885)
* add hub instance crash recovery
Diffstat (limited to 'src')
| -rw-r--r-- | src/zenserver/frontend/html/pages/hub.js | 2 | ||||
| -rw-r--r-- | src/zenserver/frontend/html/zen.css | 7 | ||||
| -rw-r--r-- | src/zenserver/hub/hub.cpp | 268 | ||||
| -rw-r--r-- | src/zenserver/hub/hub.h | 6 | ||||
| -rw-r--r-- | src/zenserver/hub/hubinstancestate.cpp | 4 | ||||
| -rw-r--r-- | src/zenserver/hub/hubinstancestate.h | 2 | ||||
| -rw-r--r-- | src/zenserver/hub/storageserverinstance.cpp | 57 | ||||
| -rw-r--r-- | src/zenserver/hub/storageserverinstance.h | 6 | ||||
| -rw-r--r-- | src/zenutil/include/zenutil/zenserverprocess.h | 1 | ||||
| -rw-r--r-- | src/zenutil/zenserverprocess.cpp | 10 |
10 files changed, 328 insertions, 35 deletions
diff --git a/src/zenserver/frontend/html/pages/hub.js b/src/zenserver/frontend/html/pages/hub.js index 3a5b67483..149a5c79c 100644 --- a/src/zenserver/frontend/html/pages/hub.js +++ b/src/zenserver/frontend/html/pages/hub.js @@ -8,7 +8,7 @@ import { Friendly } from "../util/friendly.js" import { Modal } from "../util/modal.js" //////////////////////////////////////////////////////////////////////////////// -const STABLE_STATES = new Set(["provisioned", "hibernated"]); +const STABLE_STATES = new Set(["provisioned", "hibernated", "crashed"]); function _is_actionable(state) { diff --git a/src/zenserver/frontend/html/zen.css b/src/zenserver/frontend/html/zen.css index 5ce60d2d2..49a1e6d21 100644 --- a/src/zenserver/frontend/html/zen.css +++ b/src/zenserver/frontend/html/zen.css @@ -1099,6 +1099,12 @@ tr:last-child td { .module-state-dot[data-state="provisioned"] { background: var(--theme_ok); } .module-state-dot[data-state="hibernated"] { background: var(--theme_warn); } .module-state-dot[data-state="unprovisioned"] { background: var(--theme_g1); } +.module-state-dot[data-state="crashed"] { background: var(--theme_fail); } + +@keyframes module-dot-recovering { + 0%, 59.9% { background: var(--theme_fail); } + 60%, 100% { background: var(--theme_ok); } +} @keyframes module-dot-hibernating { 0%, 59.9% { background: var(--theme_warn); } @@ -1124,6 +1130,7 @@ tr:last-child td { .module-state-dot[data-state="hibernating"] { animation: module-dot-hibernating 1s steps(1, end) infinite; } .module-state-dot[data-state="waking"] { animation: module-dot-waking 1s steps(1, end) infinite; } .module-state-dot[data-state="provisioning"] { animation: module-dot-provisioning 1s steps(1, end) infinite; } +.module-state-dot[data-state="recovering"] { animation: module-dot-recovering 1s steps(1, end) infinite; } .module-state-dot[data-state="deprovisioning"][data-prev-state="provisioned"] { animation: module-dot-deprovisioning-from-provisioned 1s steps(1, end) infinite; } diff --git a/src/zenserver/hub/hub.cpp b/src/zenserver/hub/hub.cpp index ebbb9432a..e8487d7d9 100644 --- a/src/zenserver/hub/hub.cpp +++ b/src/zenserver/hub/hub.cpp @@ -307,6 +307,13 @@ Hub::Provision(std::string_view ModuleId, HubProvisionedInstanceInfo& OutInfo, s const size_t ActiveInstanceIndex = It->second; ZEN_ASSERT(m_ActiveInstances.size() > ActiveInstanceIndex); + if (m_RecoveringModules.contains(std::string(ModuleId))) + { + OutReason = fmt::format("Module '{}' is currently recovering from a crash", ModuleId); + ZEN_WARN("Attempted to provision module '{}' which is currently recovering", ModuleId); + return false; + } + std::unique_ptr<StorageServerInstance>& InstanceRaw = m_ActiveInstances[ActiveInstanceIndex]; Instance = InstanceRaw->LockExclusive(/*Wait*/ true); AllocatedPort = InstanceRaw->GetBasePort(); @@ -409,6 +416,13 @@ Hub::Deprovision(const std::string& ModuleId, std::string& OutReason) return false; } + if (m_RecoveringModules.contains(ModuleId)) + { + OutReason = fmt::format("Module '{}' is currently recovering from a crash", ModuleId); + ZEN_WARN("Attempted to deprovision module '{}' which is currently recovering", ModuleId); + return false; + } + if (auto It = m_InstanceLookup.find(ModuleId); It == m_InstanceLookup.end()) { ZEN_WARN("Attempted to deprovision non-existent module '{}'", ModuleId); @@ -479,7 +493,7 @@ Hub::Hibernate(const std::string& ModuleId, std::string& OutReason) RwLock::ExclusiveLockScope _(m_Lock); if (m_ProvisioningModules.contains(ModuleId) || m_DeprovisioningModules.contains(ModuleId) || - m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId)) + m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId) || m_RecoveringModules.contains(ModuleId)) { OutReason = fmt::format("Module '{}' is currently changing state", ModuleId); return false; @@ -538,7 +552,7 @@ Hub::Wake(const std::string& ModuleId, std::string& OutReason) RwLock::ExclusiveLockScope _(m_Lock); if (m_ProvisioningModules.contains(ModuleId) || m_DeprovisioningModules.contains(ModuleId) || - m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId)) + m_HibernatingModules.contains(ModuleId) || m_WakingModules.contains(ModuleId) || m_RecoveringModules.contains(ModuleId)) { OutReason = fmt::format("Module '{}' is currently changing state", ModuleId); return false; @@ -707,60 +721,159 @@ Hub::CanProvisionInstance(std::string_view ModuleId, std::string& OutReason) } void +Hub::AttemptRecoverInstance(std::string_view ModuleId) +{ + StorageServerInstance::ExclusiveLockedPtr Instance; + StorageServerInstance* RawInstance = nullptr; + + { + RwLock::ExclusiveLockScope _(m_Lock); + + if (m_RecoveringModules.contains(std::string(ModuleId)) || m_ProvisioningModules.contains(std::string(ModuleId)) || + m_DeprovisioningModules.contains(std::string(ModuleId)) || m_HibernatingModules.contains(std::string(ModuleId)) || + m_WakingModules.contains(std::string(ModuleId))) + { + return; + } + + auto It = m_InstanceLookup.find(std::string(ModuleId)); + if (It == m_InstanceLookup.end()) + { + return; + } + + const size_t ActiveInstanceIndex = It->second; + ZEN_ASSERT(ActiveInstanceIndex < m_ActiveInstances.size()); + RawInstance = m_ActiveInstances[ActiveInstanceIndex].get(); + Instance = RawInstance->LockExclusive(/*Wait*/ true); + m_RecoveringModules.emplace(std::string(ModuleId)); + } + + ZEN_ASSERT(Instance); + + auto RemoveRecoveringModule = MakeGuard([&] { + RwLock::ExclusiveLockScope _(m_Lock); + m_RecoveringModules.erase(std::string(ModuleId)); + }); + + // Re-validate: state may have changed between releasing shared lock and acquiring exclusive lock + if (Instance.GetState() != HubInstanceState::Provisioned || Instance.IsRunning()) + { + return; + } + + const uint16_t Port = RawInstance->GetBasePort(); + std::string BaseUri; // TODO? + + if (Instance.RecoverFromCrash()) + { + Instance = {}; + return; + } + + // Restart threw but data dir is intact - run Dehydrate via Deprovision before cleanup. + Instance.Deprovision(); + Instance = {}; + + if (m_DeprovisionedModuleCallback) + { + try + { + m_DeprovisionedModuleCallback(ModuleId, HubProvisionedInstanceInfo{.BaseUri = BaseUri, .Port = Port}); + } + catch (const std::exception& Ex) + { + ZEN_ERROR("Deprovision callback for recovered module {} failed. Reason: '{}'", ModuleId, Ex.what()); + } + } + + std::unique_ptr<StorageServerInstance> DestroyInstance; + { + RwLock::ExclusiveLockScope _(m_Lock); + if (auto It = m_InstanceLookup.find(std::string(ModuleId)); It != m_InstanceLookup.end()) + { + const size_t ActiveInstanceIndex = It->second; + ZEN_ASSERT(ActiveInstanceIndex < m_ActiveInstances.size()); + DestroyInstance = std::move(m_ActiveInstances[ActiveInstanceIndex]); + m_FreeActiveInstanceIndexes.push_back(ActiveInstanceIndex); + m_InstanceLookup.erase(It); + } + m_FreePorts.push_back(Port); + m_RecoveringModules.erase(std::string(ModuleId)); + } + RemoveRecoveringModule.Dismiss(); + + try + { + DestroyInstance.reset(); + } + catch (const std::exception& Ex) + { + ZEN_ERROR("Failed to destroy recovered instance for module '{}': {}", ModuleId, Ex.what()); + } +} + +void Hub::WatchDog() { constexpr uint64_t WatchDogWakeupTimeMs = 5000; constexpr uint64_t WatchDogProcessingTimeMs = 500; - size_t CheckInstanceIndex = 0; + size_t CheckInstanceIndex = SIZE_MAX; // first increment wraps to 0 while (!m_WatchDogEvent.Wait(WatchDogWakeupTimeMs)) { try { - size_t MaxCheckCount = m_Lock.WithSharedLock([this]() { return m_InstanceLookup.size(); }); + // Snapshot slot count. We iterate all slots (including freed nulls) so + // round-robin coverage is not skewed by deprovisioned entries. + size_t SlotsRemaining = m_Lock.WithSharedLock([this]() { return m_ActiveInstances.size(); }); Stopwatch Timer; - while (MaxCheckCount-- > 0 && Timer.GetElapsedTimeMs() < WatchDogProcessingTimeMs && !m_WatchDogEvent.Wait(5)) + bool ShuttingDown = false; + while (SlotsRemaining > 0 && Timer.GetElapsedTimeMs() < WatchDogProcessingTimeMs && !ShuttingDown) { StorageServerInstance::SharedLockedPtr LockedInstance; - m_Lock.WithSharedLock([this, &CheckInstanceIndex, &LockedInstance]() { - if (m_InstanceLookup.empty()) - { - return; - } - - size_t MaxLoopCount = m_ActiveInstances.size(); - StorageServerInstance* Instance = nullptr; - while (MaxLoopCount-- > 0 && !Instance) + m_Lock.WithSharedLock([this, &CheckInstanceIndex, &LockedInstance, &SlotsRemaining]() { + // Advance through null (freed) slots under a single lock acquisition. + while (SlotsRemaining > 0) { + SlotsRemaining--; CheckInstanceIndex++; if (CheckInstanceIndex >= m_ActiveInstances.size()) { CheckInstanceIndex = 0; } - Instance = (CheckInstanceIndex < m_ActiveInstances.size()) ? m_ActiveInstances[CheckInstanceIndex].get() : nullptr; - } - - if (Instance) - { - LockedInstance = Instance->LockShared(/*Wait*/ false); + StorageServerInstance* Instance = m_ActiveInstances[CheckInstanceIndex].get(); + if (Instance) + { + LockedInstance = Instance->LockShared(/*Wait*/ false); + break; // Found a live slot (locked or busy); stop scanning this batch. + } } }); - if (LockedInstance) + if (!LockedInstance) { - if (LockedInstance.IsRunning()) - { - LockedInstance.UpdateMetrics(); - } - else if (LockedInstance.GetState() == HubInstanceState::Provisioned) - { - // Process is not running but state says it should be - instance died unexpectedly. - // TODO: Track and attempt recovery. - } - // else: transitional state (Provisioning, Deprovisioning, Hibernating, Waking) - expected, skip. + // Either all remaining slots were null, or the live slot's lock was busy -- move on. + continue; + } + + if (LockedInstance.IsRunning()) + { + LockedInstance.UpdateMetrics(); + } + else if (LockedInstance.GetState() == HubInstanceState::Provisioned) + { + // Process is not running but state says it should be - instance died unexpectedly. + const std::string ModuleId(LockedInstance.GetModuleId()); LockedInstance = {}; + AttemptRecoverInstance(ModuleId); } + // else: transitional state (Provisioning, Deprovisioning, Hibernating, Waking, Recovering, Crashed) - expected, skip. + LockedInstance = {}; + + // Rate-limit: pause briefly between live-instance checks and respond to shutdown. + ShuttingDown = m_WatchDogEvent.Wait(5); } } catch (const std::exception& Ex) @@ -1293,9 +1406,102 @@ TEST_CASE("hub.hibernate_wake_errors") CHECK(Reason.empty()); } +TEST_CASE("hub.recover_process_crash") +{ + ScopedTemporaryDirectory TempDir; + std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path()); + + HubProvisionedInstanceInfo Info; + std::string Reason; + REQUIRE_MESSAGE(HubInstance->Provision("module_a", Info, Reason), Reason); + + // Kill the child process to simulate a crash, then poll until the watchdog detects it, + // recovers the instance, and the new process is serving requests. + HubInstance->TerminateModuleForTesting("module_a"); + + constexpr auto kPollIntervalMs = std::chrono::milliseconds(200); + constexpr auto kTimeoutMs = std::chrono::seconds(20); + const auto Deadline = std::chrono::steady_clock::now() + kTimeoutMs; + + // A successful HTTP health check on the same port confirms the new process is up. + HttpClient ModClient(fmt::format("http://localhost:{}", Info.Port), kFastTimeout); + bool Recovered = false; + while (std::chrono::steady_clock::now() < Deadline) + { + std::this_thread::sleep_for(kPollIntervalMs); + Hub::InstanceInfo InstanceInfo; + if (HubInstance->Find("module_a", &InstanceInfo) && InstanceInfo.State == HubInstanceState::Provisioned && + ModClient.Get("/health/")) + { + // Recovery must reuse the same port - the instance was never removed from the hub's + // port table during recovery, so AttemptRecoverInstance reuses m_Config.BasePort. + CHECK_EQ(InstanceInfo.Port, Info.Port); + Recovered = true; + break; + } + } + CHECK_MESSAGE(Recovered, "Instance did not recover within timeout"); +} + +TEST_CASE("hub.recover_process_crash_then_deprovision") +{ + ScopedTemporaryDirectory TempDir; + std::unique_ptr<Hub> HubInstance = hub_testutils::MakeHub(TempDir.Path()); + + HubProvisionedInstanceInfo Info; + std::string Reason; + REQUIRE_MESSAGE(HubInstance->Provision("module_a", Info, Reason), Reason); + + // Kill the child process, wait for the watchdog to detect and recover the instance. + HubInstance->TerminateModuleForTesting("module_a"); + + constexpr auto kPollIntervalMs = std::chrono::milliseconds(200); + constexpr auto kTimeoutMs = std::chrono::seconds(20); + const auto Deadline = std::chrono::steady_clock::now() + kTimeoutMs; + + bool Recovered = false; + while (std::chrono::steady_clock::now() < Deadline) + { + std::this_thread::sleep_for(kPollIntervalMs); + Hub::InstanceInfo InstanceInfo; + if (HubInstance->Find("module_a", &InstanceInfo) && InstanceInfo.State == HubInstanceState::Provisioned) + { + Recovered = true; + break; + } + } + REQUIRE_MESSAGE(Recovered, "Instance did not recover within timeout"); + + // After recovery, deprovision should succeed and a re-provision should work. + CHECK_MESSAGE(HubInstance->Deprovision("module_a", Reason), Reason); + CHECK_EQ(HubInstance->GetInstanceCount(), 0); + + HubProvisionedInstanceInfo NewInfo; + CHECK_MESSAGE(HubInstance->Provision("module_a", NewInfo, Reason), Reason); + CHECK_NE(NewInfo.Port, 0); + HttpClient NewClient(fmt::format("http://localhost:{}", NewInfo.Port), kFastTimeout); + CHECK_MESSAGE(NewClient.Get("/health/"), "Re-provisioned instance is not serving requests"); +} + TEST_SUITE_END(); void +Hub::TerminateModuleForTesting(const std::string& ModuleId) +{ + RwLock::SharedLockScope _(m_Lock); + auto It = m_InstanceLookup.find(ModuleId); + if (It == m_InstanceLookup.end()) + { + return; + } + StorageServerInstance::SharedLockedPtr Locked = m_ActiveInstances[It->second]->LockShared(/*Wait*/ true); + if (Locked) + { + Locked.TerminateForTesting(); + } +} + +void hub_forcelink() { } diff --git a/src/zenserver/hub/hub.h b/src/zenserver/hub/hub.h index 8c4039c38..f880dab52 100644 --- a/src/zenserver/hub/hub.h +++ b/src/zenserver/hub/hub.h @@ -131,6 +131,10 @@ public: const Configuration& GetConfig() const { return m_Config; } +#if ZEN_WITH_TESTS + void TerminateModuleForTesting(const std::string& ModuleId); +#endif + private: const Configuration m_Config; ZenServerEnvironment m_RunEnvironment; @@ -150,6 +154,7 @@ private: std::unordered_set<std::string> m_ProvisioningModules; std::unordered_set<std::string> m_HibernatingModules; std::unordered_set<std::string> m_WakingModules; + std::unordered_set<std::string> m_RecoveringModules; std::vector<std::unique_ptr<StorageServerInstance>> m_ActiveInstances; std::vector<size_t> m_FreeActiveInstanceIndexes; ResourceMetrics m_ResourceLimits; @@ -160,6 +165,7 @@ private: Event m_WatchDogEvent; void WatchDog(); + void AttemptRecoverInstance(std::string_view ModuleId); void UpdateStats(); void UpdateCapacityMetrics(); diff --git a/src/zenserver/hub/hubinstancestate.cpp b/src/zenserver/hub/hubinstancestate.cpp index 8337f73a8..c47fdd294 100644 --- a/src/zenserver/hub/hubinstancestate.cpp +++ b/src/zenserver/hub/hubinstancestate.cpp @@ -25,6 +25,10 @@ ToString(HubInstanceState State) return "waking"; case HubInstanceState::Deprovisioning: return "deprovisioning"; + case HubInstanceState::Crashed: + return "crashed"; + case HubInstanceState::Recovering: + return "recovering"; } ZEN_ASSERT(false); return "unknown"; diff --git a/src/zenserver/hub/hubinstancestate.h b/src/zenserver/hub/hubinstancestate.h index ce1f222bd..2dee89ff0 100644 --- a/src/zenserver/hub/hubinstancestate.h +++ b/src/zenserver/hub/hubinstancestate.h @@ -16,6 +16,8 @@ enum class HubInstanceState : uint32_t Hibernated, // Process stopped, data preserved; can be woken Waking, // Starting process from preserved data Deprovisioning, // Shutting down process and cleaning up data + Crashed, // Process died unexpectedly while Provisioned; recovery pending + Recovering, // Attempting in-place restart after a crash }; std::string_view ToString(HubInstanceState State); diff --git a/src/zenserver/hub/storageserverinstance.cpp b/src/zenserver/hub/storageserverinstance.cpp index 7933cfa70..0a9efcc44 100644 --- a/src/zenserver/hub/storageserverinstance.cpp +++ b/src/zenserver/hub/storageserverinstance.cpp @@ -28,6 +28,7 @@ void StorageServerInstance::SpawnServerProcess() { ZEN_ASSERT_FORMAT(!m_ServerInstance.IsRunning(), "Storage server instance for module '{}' is already running", m_ModuleId); + m_ServerInstance.ResetDeadProcess(); m_ServerInstance.SetServerExecutablePath(GetRunningExecutablePath()); m_ServerInstance.SetDataDir(m_BaseDir); @@ -77,6 +78,12 @@ StorageServerInstance::ProvisionLocked() return; } + if (m_State.load() == HubInstanceState::Crashed) + { + ZEN_WARN("Storage server instance for module '{}' is in crashed state; re-provisioning from scratch", m_ModuleId); + m_State = HubInstanceState::Unprovisioned; + } + if (m_State.load() == HubInstanceState::Hibernated) { if (WakeLocked()) @@ -105,18 +112,23 @@ StorageServerInstance::ProvisionLocked() void StorageServerInstance::DeprovisionLocked() { - if (m_State.load() != HubInstanceState::Provisioned) + const HubInstanceState CurrentState = m_State.load(); + if (CurrentState != HubInstanceState::Provisioned && CurrentState != HubInstanceState::Crashed) { ZEN_WARN("Attempted to deprovision storage server instance for module '{}' which is not provisioned (state: '{}')", m_ModuleId, - ToString(m_State.load())); + ToString(CurrentState)); return; } ZEN_INFO("Deprovisioning storage server instance for module '{}'", m_ModuleId); m_State = HubInstanceState::Deprovisioning; - m_ServerInstance.Shutdown(); + if (CurrentState == HubInstanceState::Provisioned) + { + m_ServerInstance.Shutdown(); // Graceful; process may still be running + } + // Crashed: process already dead; skip Shutdown Dehydrate(); @@ -187,6 +199,29 @@ StorageServerInstance::WakeLocked() } } +bool +StorageServerInstance::RecoverCrashedLocked() +{ + ZEN_ASSERT(m_State.load() == HubInstanceState::Provisioned); + ZEN_ASSERT(!m_ServerInstance.IsRunning()); + + ZEN_WARN("Storage server instance for module '{}' has crashed; attempting in-place recovery", m_ModuleId); + m_State = HubInstanceState::Recovering; + try + { + SpawnServerProcess(); + m_State = HubInstanceState::Provisioned; + ZEN_INFO("Storage server instance for module '{}' recovered successfully", m_ModuleId); + return true; + } + catch (const std::exception& Ex) + { + ZEN_ERROR("Failed to restart module '{}': {}", m_ModuleId, Ex.what()); + m_State = HubInstanceState::Crashed; + return false; + } +} + void StorageServerInstance::Hydrate() { @@ -303,6 +338,15 @@ StorageServerInstance::UpdateMetricsLocked() // TODO: Resource metrics... } +#if ZEN_WITH_TESTS +void +StorageServerInstance::SharedLockedPtr::TerminateForTesting() const +{ + ZEN_ASSERT(m_Instance != nullptr); + m_Instance->m_ServerInstance.Terminate(); +} +#endif + StorageServerInstance::ExclusiveLockedPtr::ExclusiveLockedPtr() : m_Lock(nullptr), m_Instance(nullptr) { } @@ -402,4 +446,11 @@ StorageServerInstance::ExclusiveLockedPtr::Wake() return m_Instance->WakeLocked(); } +bool +StorageServerInstance::ExclusiveLockedPtr::RecoverFromCrash() +{ + ZEN_ASSERT(m_Instance != nullptr); + return m_Instance->RecoverCrashedLocked(); +} + } // namespace zen diff --git a/src/zenserver/hub/storageserverinstance.h b/src/zenserver/hub/storageserverinstance.h index ba15133bf..9963f640e 100644 --- a/src/zenserver/hub/storageserverinstance.h +++ b/src/zenserver/hub/storageserverinstance.h @@ -81,6 +81,10 @@ public: return m_Instance->UpdateMetricsLocked(); } +#if ZEN_WITH_TESTS + void TerminateForTesting() const; // kills the child process to simulate a crash +#endif + private: RwLock* m_Lock = nullptr; StorageServerInstance* m_Instance = nullptr; @@ -122,6 +126,7 @@ public: void Deprovision(); [[nodiscard]] bool Hibernate(); [[nodiscard]] bool Wake(); + [[nodiscard]] bool RecoverFromCrash(); // true = recovered; false = spawn failed (Crashed), caller must Deprovision() + cleanup private: RwLock* m_Lock = nullptr; @@ -136,6 +141,7 @@ private: [[nodiscard]] bool HibernateLocked(); [[nodiscard]] bool WakeLocked(); + [[nodiscard]] bool RecoverCrashedLocked(); // true = recovered (Provisioned); false = spawn failed (Crashed) void UpdateMetricsLocked(); diff --git a/src/zenutil/include/zenutil/zenserverprocess.h b/src/zenutil/include/zenutil/zenserverprocess.h index daa07a1e1..03d507400 100644 --- a/src/zenutil/include/zenutil/zenserverprocess.h +++ b/src/zenutil/include/zenutil/zenserverprocess.h @@ -135,6 +135,7 @@ struct ZenServerInstance #endif bool IsRunning() const; bool Terminate(); + void ResetDeadProcess(); std::string GetLogOutput() const; inline ServerMode GetServerMode() const { return m_ServerMode; } diff --git a/src/zenutil/zenserverprocess.cpp b/src/zenutil/zenserverprocess.cpp index 3993d6a32..2b27b2d8b 100644 --- a/src/zenutil/zenserverprocess.cpp +++ b/src/zenutil/zenserverprocess.cpp @@ -1445,6 +1445,16 @@ ZenServerInstance::IsRunning() const return m_Process.IsRunning(); } +void +ZenServerInstance::ResetDeadProcess() +{ + if (m_Process.IsValid() && !m_Process.IsRunning()) + { + m_Process.Reset(); + m_ShutdownEvent.reset(); + } +} + std::string ZenServerInstance::GetLogOutput() const { |