aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorStefan Boberg <[email protected]>2026-03-23 11:40:11 +0100
committerGitHub Enterprise <[email protected]>2026-03-23 11:40:11 +0100
commita15de3d263a5759ca4f07aa7a2b9e6d494551150 (patch)
tree9f1f7abc0fa5e5c09a9d4bedcf0d85f3a32b7246 /src
parentimprove zenserver startup time (#879) (diff)
downloadzen-a15de3d263a5759ca4f07aa7a2b9e6d494551150.tar.xz
zen-a15de3d263a5759ca4f07aa7a2b9e6d494551150.zip
Process management improvements (#881)
This PR improves process lifecycle handling and resilience across several areas: - **Reclaim stale shared-memory entries instead of exiting** (`zenserver.cpp`): When a zenserver instance fails to attach as a sponsor to an existing process (e.g. because the PID was reused by an unrelated process), the server now clears the stale shared-memory entry and proceeds with normal startup instead of calling `std::exit(1)`. - **Wait for child process exit in `Kill()` and `Terminate()` on Unix** (`process.cpp`): After sending `SIGTERM` in `Kill()`, the code now waits up to 5s for graceful shutdown (escalating to `SIGKILL` on timeout), matching the Windows behavior. `Terminate()` also waits after `SIGKILL` so the child is properly reaped and doesn't linger as a zombie clogging up the process table. - **Fix sysctl buffer race in macOS `FindProcess`** (`process.cpp`): The macOS process enumeration now retries the `sysctl` call (up to 3 attempts with 25% buffer padding) to handle the race where the process list changes between the sizing call and the data-fetching call. Also flattens the nesting and fixes the guard/free scoping. - **Terminate stale processes before integration tests** (`zenserver-test.cpp`, `test.lua`): The integration test runner now accepts a `--kill-stale-processes` flag (passed automatically by `test.lua`) that scans for and terminates any leftover `zenserver`, `zenserver-test`, and `zentest-appstub` processes from previous test runs, logging the executable name and PID of each. This addresses flaky test failures caused by stale processes from prior runs holding ports or other resources.
Diffstat (limited to 'src')
-rw-r--r--src/zencore/process.cpp85
-rw-r--r--src/zencore/thread.cpp12
-rw-r--r--src/zenserver-test/zenserver-test.cpp41
-rw-r--r--src/zenserver/zenserver.cpp32
-rw-r--r--src/zenutil/zenserverprocess.cpp35
5 files changed, 160 insertions, 45 deletions
diff --git a/src/zencore/process.cpp b/src/zencore/process.cpp
index 8a91ab287..47289a37b 100644
--- a/src/zencore/process.cpp
+++ b/src/zencore/process.cpp
@@ -397,6 +397,17 @@ ProcessHandle::Kill()
return false;
}
}
+
+ // Wait for the process to exit after SIGTERM, matching the Windows path
+ // which waits up to 5 seconds for graceful shutdown. Without this wait
+ // the child becomes a zombie and may hold resources (e.g. TCP ports).
+ std::error_code Ec;
+ if (!Wait(5000, Ec))
+ {
+ // Graceful shutdown timed out — force-kill
+ kill(pid_t(m_Pid), SIGKILL);
+ Wait(1000, Ec);
+ }
#endif
Reset();
@@ -435,6 +446,11 @@ ProcessHandle::Terminate(int ExitCode)
return false;
}
}
+
+ // Wait for the process to be reaped after SIGKILL so it doesn't linger
+ // as a zombie holding resources (e.g. TCP ports).
+ std::error_code Ec;
+ Wait(5000, Ec);
#endif
Reset();
return true;
@@ -1648,47 +1664,60 @@ FindProcess(const std::filesystem::path& ExecutableImage, ProcessHandle& OutHand
return MakeErrorCodeFromLastError();
#endif // ZEN_PLATFORM_WINDOWS
#if ZEN_PLATFORM_MAC
- int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_ALL, 0};
- size_t BufferSize = 0;
-
- struct kinfo_proc* Processes = nullptr;
- uint32_t ProcCount = 0;
+ int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_ALL, 0};
const pid_t ThisProcessId = getpid();
- if (sysctl(Mib, 4, NULL, &BufferSize, NULL, 0) != -1 && BufferSize > 0)
+ // The process list can change between the sizing sysctl call and the data sysctl call.
+ // Retry with padding to handle this race.
+ struct kinfo_proc* Processes = nullptr;
+ size_t BufferSize = 0;
+ bool Fetched = false;
+ auto _ = MakeGuard([&]() { free(Processes); });
+
+ for (int Attempt = 0; Attempt < 3; Attempt++)
{
- struct kinfo_proc* Processes = (struct kinfo_proc*)malloc(BufferSize);
- auto _ = MakeGuard([&]() { free(Processes); });
- if (sysctl(Mib, 4, Processes, &BufferSize, NULL, 0) != -1)
+ if (sysctl(Mib, 4, nullptr, &BufferSize, nullptr, 0) == -1 || BufferSize == 0)
+ {
+ break;
+ }
+ BufferSize += BufferSize / 4;
+ free(Processes);
+ Processes = (struct kinfo_proc*)malloc(BufferSize);
+ if (sysctl(Mib, 4, Processes, &BufferSize, nullptr, 0) != -1)
{
- ProcCount = (uint32_t)(BufferSize / sizeof(struct kinfo_proc));
- char Buffer[PROC_PIDPATHINFO_MAXSIZE];
- for (uint32_t ProcIndex = 0; ProcIndex < ProcCount; ProcIndex++)
+ Fetched = true;
+ break;
+ }
+ }
+
+ if (!Fetched)
+ {
+ return MakeErrorCodeFromLastError();
+ }
+
+ uint32_t ProcCount = (uint32_t)(BufferSize / sizeof(struct kinfo_proc));
+ for (uint32_t ProcIndex = 0; ProcIndex < ProcCount; ProcIndex++)
+ {
+ pid_t Pid = Processes[ProcIndex].kp_proc.p_pid;
+ if (IncludeSelf || (Pid != ThisProcessId))
+ {
+ std::error_code Ec;
+ std::filesystem::path EntryPath = GetProcessExecutablePath(Pid, Ec);
+ if (!Ec)
{
- pid_t Pid = Processes[ProcIndex].kp_proc.p_pid;
- if (IncludeSelf || (Pid != ThisProcessId))
+ if (EntryPath == ExecutableImage)
{
- std::error_code Ec;
- std::filesystem::path EntryPath = GetProcessExecutablePath(Pid, Ec);
- if (!Ec)
+ if (Processes[ProcIndex].kp_proc.p_stat != SZOMB)
{
- if (EntryPath == ExecutableImage)
- {
- if (Processes[ProcIndex].kp_proc.p_stat != SZOMB)
- {
- OutHandle.Initialize(Pid, Ec);
- return Ec;
- }
- }
+ OutHandle.Initialize(Pid, Ec);
+ return Ec;
}
- Ec.clear();
}
}
- return {};
}
}
- return MakeErrorCodeFromLastError();
+ return {};
#endif // ZEN_PLATFORM_MAC
#if ZEN_PLATFORM_LINUX
const pid_t ThisProcessId = getpid();
diff --git a/src/zencore/thread.cpp b/src/zencore/thread.cpp
index f74791333..067e66c0d 100644
--- a/src/zencore/thread.cpp
+++ b/src/zencore/thread.cpp
@@ -503,6 +503,18 @@ NamedMutex::~NamedMutex()
if (m_MutexHandle)
{
int Inner = int(intptr_t(m_MutexHandle));
+
+ // Remove the backing file before releasing the lock so that new callers
+ // of Create()/Exists() won't find a stale entry. Other processes that
+ // already have the file open still hold valid fds (unlink only removes
+ // the directory entry; the inode lives until the last fd is closed).
+ std::error_code Ec;
+ std::filesystem::path Name = PathFromHandle((void*)(intptr_t(Inner)), Ec);
+ if (!Ec)
+ {
+ unlink(Name.c_str());
+ }
+
flock(Inner, LOCK_UN);
close(Inner);
}
diff --git a/src/zenserver-test/zenserver-test.cpp b/src/zenserver-test/zenserver-test.cpp
index fff77957d..cf7ffe4e4 100644
--- a/src/zenserver-test/zenserver-test.cpp
+++ b/src/zenserver-test/zenserver-test.cpp
@@ -99,7 +99,8 @@ main(int argc, char** argv)
// somehow in the future
std::string ServerClass;
- bool Verbose = false;
+ bool Verbose = false;
+ bool KillStale = false;
for (int i = 1; i < argc; ++i)
{
@@ -119,6 +120,44 @@ main(int argc, char** argv)
{
Verbose = true;
}
+ else if (argv[i] == "--kill-stale-processes"sv)
+ {
+ KillStale = true;
+ }
+ }
+
+ // Since GHA runners can leave processes behind from a previous test run, we need
+ // to be able to clean up any stale server processes before starting the tests to
+ // avoid interference
+
+ if (KillStale)
+ {
+ ZEN_INFO("Killing any stale processes from previous test runs...");
+
+ auto KillStaleProcesses = [](const std::filesystem::path& Executable) {
+ ZEN_INFO(" Looking for stale '{}' processes...", Executable.filename());
+
+ for (;;)
+ {
+ ProcessHandle StaleProcess;
+ std::error_code Ec = FindProcess(Executable, StaleProcess, /*IncludeSelf*/ false);
+
+ if (Ec || !StaleProcess.IsValid())
+ {
+ break;
+ }
+
+ ZEN_WARN("====> Found stale '{}' process (pid {}) from a previous test run - terminating it",
+ Executable.filename(),
+ StaleProcess.Pid());
+
+ StaleProcess.Terminate(0);
+ }
+ };
+
+ KillStaleProcesses(ProgramBaseDir / "zenserver" ZEN_EXE_SUFFIX_LITERAL);
+ KillStaleProcesses(ProgramBaseDir / "zenserver-test" ZEN_EXE_SUFFIX_LITERAL);
+ KillStaleProcesses(ProgramBaseDir / "zentest-appstub" ZEN_EXE_SUFFIX_LITERAL);
}
zen::tests::TestEnv.InitializeForTest(ProgramBaseDir, TestBaseDir, ServerClass);
diff --git a/src/zenserver/zenserver.cpp b/src/zenserver/zenserver.cpp
index 49cbbb9fc..ea86c5654 100644
--- a/src/zenserver/zenserver.cpp
+++ b/src/zenserver/zenserver.cpp
@@ -684,11 +684,33 @@ ZenServerMain::Run()
}
else
{
- ZEN_CONSOLE_WARN(ZEN_APP_NAME " exiting, failed to add sponsor owner pid {} to process listening to port {} (pid: {})",
- m_ServerOptions.OwnerPid,
- m_ServerOptions.BasePort,
- Entry->Pid.load());
- std::exit(1);
+ // The entry's process failed to pick up our sponsor request after
+ // multiple attempts. Before reclaiming the entry, verify that the
+ // PID does not still belong to a zenserver process. If it does, the
+ // server is alive but unresponsive – fall back to the original error
+ // path. If the PID is gone or belongs to a different executable the
+ // entry is genuinely stale and safe to reclaim.
+ const int StalePid = Entry->Pid.load();
+ std::error_code ExeEc;
+ std::filesystem::path PidExePath = GetProcessExecutablePath(StalePid, ExeEc);
+ const bool PidIsZenServer = !ExeEc && (PidExePath.filename() == GetRunningExecutablePath().filename());
+ if (PidIsZenServer)
+ {
+ ZEN_CONSOLE_WARN(ZEN_APP_NAME
+ " exiting, failed to add sponsor to process on port {} "
+ "(pid {}); that pid is still a running zenserver instance",
+ m_ServerOptions.BasePort,
+ StalePid);
+ std::exit(1);
+ }
+ ZEN_CONSOLE_WARN(
+ "Failed to add sponsor to process on port {} (pid {}); "
+ "pid belongs to '{}' – assuming stale entry and reclaiming",
+ m_ServerOptions.BasePort,
+ StalePid,
+ ExeEc ? "<unknown>" : PidExePath.filename().string());
+ Entry->Reset();
+ Entry = nullptr;
}
}
else
diff --git a/src/zenutil/zenserverprocess.cpp b/src/zenutil/zenserverprocess.cpp
index a2ab4c291..3993d6a32 100644
--- a/src/zenutil/zenserverprocess.cpp
+++ b/src/zenutil/zenserverprocess.cpp
@@ -1375,18 +1375,31 @@ ZenServerInstance::OnServerReady()
const ZenServerState::ZenServerEntry* Entry = nullptr;
- if (m_BasePort)
+ // The child process signals its ready event after writing its state entry, but under
+ // heavy instrumentation (e.g. sanitizers) the shared memory writes may not be immediately
+ // visible to this process. Retry briefly before giving up.
+ for (int Attempt = 0; Attempt < 10; ++Attempt)
{
- Entry = State.Lookup(m_BasePort);
- }
- else
- {
- State.Snapshot([&](const ZenServerState::ZenServerEntry& InEntry) {
- if (InEntry.Pid == (uint32_t)m_Process.Pid())
- {
- Entry = &InEntry;
- }
- });
+ if (m_BasePort)
+ {
+ Entry = State.Lookup(m_BasePort);
+ }
+ else
+ {
+ State.Snapshot([&](const ZenServerState::ZenServerEntry& InEntry) {
+ if (InEntry.Pid == (uint32_t)m_Process.Pid())
+ {
+ Entry = &InEntry;
+ }
+ });
+ }
+
+ if (Entry)
+ {
+ break;
+ }
+
+ Sleep(100);
}
if (!Entry)