aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2026-03-23 11:40:11 +0100
committerGitHub Enterprise <[email protected]>2026-03-23 11:40:11 +0100
commita15de3d263a5759ca4f07aa7a2b9e6d494551150 (patch)
tree9f1f7abc0fa5e5c09a9d4bedcf0d85f3a32b7246
parentimprove zenserver startup time (#879) (diff)
downloadzen-a15de3d263a5759ca4f07aa7a2b9e6d494551150.tar.xz
zen-a15de3d263a5759ca4f07aa7a2b9e6d494551150.zip
Process management improvements (#881)
This PR improves process lifecycle handling and resilience across several areas: - **Reclaim stale shared-memory entries instead of exiting** (`zenserver.cpp`): When a zenserver instance fails to attach as a sponsor to an existing process (e.g. because the PID was reused by an unrelated process), the server now clears the stale shared-memory entry and proceeds with normal startup instead of calling `std::exit(1)`. - **Wait for child process exit in `Kill()` and `Terminate()` on Unix** (`process.cpp`): After sending `SIGTERM` in `Kill()`, the code now waits up to 5s for graceful shutdown (escalating to `SIGKILL` on timeout), matching the Windows behavior. `Terminate()` also waits after `SIGKILL` so the child is properly reaped and doesn't linger as a zombie clogging up the process table. - **Fix sysctl buffer race in macOS `FindProcess`** (`process.cpp`): The macOS process enumeration now retries the `sysctl` call (up to 3 attempts with 25% buffer padding) to handle the race where the process list changes between the sizing call and the data-fetching call. Also flattens the nesting and fixes the guard/free scoping. - **Terminate stale processes before integration tests** (`zenserver-test.cpp`, `test.lua`): The integration test runner now accepts a `--kill-stale-processes` flag (passed automatically by `test.lua`) that scans for and terminates any leftover `zenserver`, `zenserver-test`, and `zentest-appstub` processes from previous test runs, logging the executable name and PID of each. This addresses flaky test failures caused by stale processes from prior runs holding ports or other resources.
-rw-r--r--scripts/test.lua8
-rw-r--r--src/zencore/process.cpp85
-rw-r--r--src/zencore/thread.cpp12
-rw-r--r--src/zenserver-test/zenserver-test.cpp41
-rw-r--r--src/zenserver/zenserver.cpp32
-rw-r--r--src/zenutil/zenserverprocess.cpp35
6 files changed, 166 insertions, 47 deletions
diff --git a/scripts/test.lua b/scripts/test.lua
index 3c18225fb..52495342a 100644
--- a/scripts/test.lua
+++ b/scripts/test.lua
@@ -231,8 +231,12 @@ function main()
if use_noskip then
cmd = string.format("%s --no-skip", cmd)
end
- if use_verbose and name == "integration" then
- cmd = string.format("%s --verbose", cmd)
+ if name == "integration" then
+ cmd = string.format("%s --kill-stale-processes", cmd)
+
+ if use_verbose then
+ cmd = string.format("%s --verbose", cmd)
+ end
end
for _, arg in ipairs(extra_args) do
cmd = string.format("%s %s", cmd, arg)
diff --git a/src/zencore/process.cpp b/src/zencore/process.cpp
index 8a91ab287..47289a37b 100644
--- a/src/zencore/process.cpp
+++ b/src/zencore/process.cpp
@@ -397,6 +397,17 @@ ProcessHandle::Kill()
return false;
}
}
+
+ // Wait for the process to exit after SIGTERM, matching the Windows path
+ // which waits up to 5 seconds for graceful shutdown. Without this wait
+ // the child becomes a zombie and may hold resources (e.g. TCP ports).
+ std::error_code Ec;
+ if (!Wait(5000, Ec))
+ {
+ // Graceful shutdown timed out — force-kill
+ kill(pid_t(m_Pid), SIGKILL);
+ Wait(1000, Ec);
+ }
#endif
Reset();
@@ -435,6 +446,11 @@ ProcessHandle::Terminate(int ExitCode)
return false;
}
}
+
+ // Wait for the process to be reaped after SIGKILL so it doesn't linger
+ // as a zombie holding resources (e.g. TCP ports).
+ std::error_code Ec;
+ Wait(5000, Ec);
#endif
Reset();
return true;
@@ -1648,47 +1664,60 @@ FindProcess(const std::filesystem::path& ExecutableImage, ProcessHandle& OutHand
return MakeErrorCodeFromLastError();
#endif // ZEN_PLATFORM_WINDOWS
#if ZEN_PLATFORM_MAC
- int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_ALL, 0};
- size_t BufferSize = 0;
-
- struct kinfo_proc* Processes = nullptr;
- uint32_t ProcCount = 0;
+ int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_ALL, 0};
const pid_t ThisProcessId = getpid();
- if (sysctl(Mib, 4, NULL, &BufferSize, NULL, 0) != -1 && BufferSize > 0)
+ // The process list can change between the sizing sysctl call and the data sysctl call.
+ // Retry with padding to handle this race.
+ struct kinfo_proc* Processes = nullptr;
+ size_t BufferSize = 0;
+ bool Fetched = false;
+ auto _ = MakeGuard([&]() { free(Processes); });
+
+ for (int Attempt = 0; Attempt < 3; Attempt++)
{
- struct kinfo_proc* Processes = (struct kinfo_proc*)malloc(BufferSize);
- auto _ = MakeGuard([&]() { free(Processes); });
- if (sysctl(Mib, 4, Processes, &BufferSize, NULL, 0) != -1)
+ if (sysctl(Mib, 4, nullptr, &BufferSize, nullptr, 0) == -1 || BufferSize == 0)
+ {
+ break;
+ }
+ BufferSize += BufferSize / 4;
+ free(Processes);
+ Processes = (struct kinfo_proc*)malloc(BufferSize);
+ if (sysctl(Mib, 4, Processes, &BufferSize, nullptr, 0) != -1)
{
- ProcCount = (uint32_t)(BufferSize / sizeof(struct kinfo_proc));
- char Buffer[PROC_PIDPATHINFO_MAXSIZE];
- for (uint32_t ProcIndex = 0; ProcIndex < ProcCount; ProcIndex++)
+ Fetched = true;
+ break;
+ }
+ }
+
+ if (!Fetched)
+ {
+ return MakeErrorCodeFromLastError();
+ }
+
+ uint32_t ProcCount = (uint32_t)(BufferSize / sizeof(struct kinfo_proc));
+ for (uint32_t ProcIndex = 0; ProcIndex < ProcCount; ProcIndex++)
+ {
+ pid_t Pid = Processes[ProcIndex].kp_proc.p_pid;
+ if (IncludeSelf || (Pid != ThisProcessId))
+ {
+ std::error_code Ec;
+ std::filesystem::path EntryPath = GetProcessExecutablePath(Pid, Ec);
+ if (!Ec)
{
- pid_t Pid = Processes[ProcIndex].kp_proc.p_pid;
- if (IncludeSelf || (Pid != ThisProcessId))
+ if (EntryPath == ExecutableImage)
{
- std::error_code Ec;
- std::filesystem::path EntryPath = GetProcessExecutablePath(Pid, Ec);
- if (!Ec)
+ if (Processes[ProcIndex].kp_proc.p_stat != SZOMB)
{
- if (EntryPath == ExecutableImage)
- {
- if (Processes[ProcIndex].kp_proc.p_stat != SZOMB)
- {
- OutHandle.Initialize(Pid, Ec);
- return Ec;
- }
- }
+ OutHandle.Initialize(Pid, Ec);
+ return Ec;
}
- Ec.clear();
}
}
- return {};
}
}
- return MakeErrorCodeFromLastError();
+ return {};
#endif // ZEN_PLATFORM_MAC
#if ZEN_PLATFORM_LINUX
const pid_t ThisProcessId = getpid();
diff --git a/src/zencore/thread.cpp b/src/zencore/thread.cpp
index f74791333..067e66c0d 100644
--- a/src/zencore/thread.cpp
+++ b/src/zencore/thread.cpp
@@ -503,6 +503,18 @@ NamedMutex::~NamedMutex()
if (m_MutexHandle)
{
int Inner = int(intptr_t(m_MutexHandle));
+
+ // Remove the backing file before releasing the lock so that new callers
+ // of Create()/Exists() won't find a stale entry. Other processes that
+ // already have the file open still hold valid fds (unlink only removes
+ // the directory entry; the inode lives until the last fd is closed).
+ std::error_code Ec;
+ std::filesystem::path Name = PathFromHandle((void*)(intptr_t(Inner)), Ec);
+ if (!Ec)
+ {
+ unlink(Name.c_str());
+ }
+
flock(Inner, LOCK_UN);
close(Inner);
}
diff --git a/src/zenserver-test/zenserver-test.cpp b/src/zenserver-test/zenserver-test.cpp
index fff77957d..cf7ffe4e4 100644
--- a/src/zenserver-test/zenserver-test.cpp
+++ b/src/zenserver-test/zenserver-test.cpp
@@ -99,7 +99,8 @@ main(int argc, char** argv)
// somehow in the future
std::string ServerClass;
- bool Verbose = false;
+ bool Verbose = false;
+ bool KillStale = false;
for (int i = 1; i < argc; ++i)
{
@@ -119,6 +120,44 @@ main(int argc, char** argv)
{
Verbose = true;
}
+ else if (argv[i] == "--kill-stale-processes"sv)
+ {
+ KillStale = true;
+ }
+ }
+
+ // Since GHA runners can leave processes behind from a previous test run, we need
+ // to be able to clean up any stale server processes before starting the tests to
+ // avoid interference
+
+ if (KillStale)
+ {
+ ZEN_INFO("Killing any stale processes from previous test runs...");
+
+ auto KillStaleProcesses = [](const std::filesystem::path& Executable) {
+ ZEN_INFO(" Looking for stale '{}' processes...", Executable.filename());
+
+ for (;;)
+ {
+ ProcessHandle StaleProcess;
+ std::error_code Ec = FindProcess(Executable, StaleProcess, /*IncludeSelf*/ false);
+
+ if (Ec || !StaleProcess.IsValid())
+ {
+ break;
+ }
+
+ ZEN_WARN("====> Found stale '{}' process (pid {}) from a previous test run - terminating it",
+ Executable.filename(),
+ StaleProcess.Pid());
+
+ StaleProcess.Terminate(0);
+ }
+ };
+
+ KillStaleProcesses(ProgramBaseDir / "zenserver" ZEN_EXE_SUFFIX_LITERAL);
+ KillStaleProcesses(ProgramBaseDir / "zenserver-test" ZEN_EXE_SUFFIX_LITERAL);
+ KillStaleProcesses(ProgramBaseDir / "zentest-appstub" ZEN_EXE_SUFFIX_LITERAL);
}
zen::tests::TestEnv.InitializeForTest(ProgramBaseDir, TestBaseDir, ServerClass);
diff --git a/src/zenserver/zenserver.cpp b/src/zenserver/zenserver.cpp
index 49cbbb9fc..ea86c5654 100644
--- a/src/zenserver/zenserver.cpp
+++ b/src/zenserver/zenserver.cpp
@@ -684,11 +684,33 @@ ZenServerMain::Run()
}
else
{
- ZEN_CONSOLE_WARN(ZEN_APP_NAME " exiting, failed to add sponsor owner pid {} to process listening to port {} (pid: {})",
- m_ServerOptions.OwnerPid,
- m_ServerOptions.BasePort,
- Entry->Pid.load());
- std::exit(1);
+ // The entry's process failed to pick up our sponsor request after
+ // multiple attempts. Before reclaiming the entry, verify that the
+ // PID does not still belong to a zenserver process. If it does, the
+ // server is alive but unresponsive – fall back to the original error
+ // path. If the PID is gone or belongs to a different executable the
+ // entry is genuinely stale and safe to reclaim.
+ const int StalePid = Entry->Pid.load();
+ std::error_code ExeEc;
+ std::filesystem::path PidExePath = GetProcessExecutablePath(StalePid, ExeEc);
+ const bool PidIsZenServer = !ExeEc && (PidExePath.filename() == GetRunningExecutablePath().filename());
+ if (PidIsZenServer)
+ {
+ ZEN_CONSOLE_WARN(ZEN_APP_NAME
+ " exiting, failed to add sponsor to process on port {} "
+ "(pid {}); that pid is still a running zenserver instance",
+ m_ServerOptions.BasePort,
+ StalePid);
+ std::exit(1);
+ }
+ ZEN_CONSOLE_WARN(
+ "Failed to add sponsor to process on port {} (pid {}); "
+ "pid belongs to '{}' – assuming stale entry and reclaiming",
+ m_ServerOptions.BasePort,
+ StalePid,
+ ExeEc ? "<unknown>" : PidExePath.filename().string());
+ Entry->Reset();
+ Entry = nullptr;
}
}
else
diff --git a/src/zenutil/zenserverprocess.cpp b/src/zenutil/zenserverprocess.cpp
index a2ab4c291..3993d6a32 100644
--- a/src/zenutil/zenserverprocess.cpp
+++ b/src/zenutil/zenserverprocess.cpp
@@ -1375,18 +1375,31 @@ ZenServerInstance::OnServerReady()
const ZenServerState::ZenServerEntry* Entry = nullptr;
- if (m_BasePort)
+ // The child process signals its ready event after writing its state entry, but under
+ // heavy instrumentation (e.g. sanitizers) the shared memory writes may not be immediately
+ // visible to this process. Retry briefly before giving up.
+ for (int Attempt = 0; Attempt < 10; ++Attempt)
{
- Entry = State.Lookup(m_BasePort);
- }
- else
- {
- State.Snapshot([&](const ZenServerState::ZenServerEntry& InEntry) {
- if (InEntry.Pid == (uint32_t)m_Process.Pid())
- {
- Entry = &InEntry;
- }
- });
+ if (m_BasePort)
+ {
+ Entry = State.Lookup(m_BasePort);
+ }
+ else
+ {
+ State.Snapshot([&](const ZenServerState::ZenServerEntry& InEntry) {
+ if (InEntry.Pid == (uint32_t)m_Process.Pid())
+ {
+ Entry = &InEntry;
+ }
+ });
+ }
+
+ if (Entry)
+ {
+ break;
+ }
+
+ Sleep(100);
}
if (!Entry)