// Copyright Epic Games, Inc. All Rights Reserved. #include "linuxrunner.h" #if ZEN_WITH_COMPUTE_SERVICES && ZEN_PLATFORM_LINUX # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include namespace zen::compute { using namespace std::literals; namespace { // All helper functions in this namespace are async-signal-safe (safe to call // between fork() and execve()). They use only raw syscalls and avoid any // heap allocation, stdio, or other non-AS-safe operations. void WriteToFd(int Fd, const char* Buf, size_t Len) { while (Len > 0) { ssize_t Written = write(Fd, Buf, Len); if (Written <= 0) { break; } Buf += Written; Len -= static_cast(Written); } } [[noreturn]] void WriteErrorAndExit(int ErrorPipeFd, const char* Msg, int Errno) { // Write the message prefix size_t MsgLen = 0; for (const char* P = Msg; *P; ++P) { ++MsgLen; } WriteToFd(ErrorPipeFd, Msg, MsgLen); // Append ": " and the errno string if non-zero if (Errno != 0) { WriteToFd(ErrorPipeFd, ": ", 2); const char* ErrStr = strerror(Errno); size_t ErrLen = 0; for (const char* P = ErrStr; *P; ++P) { ++ErrLen; } WriteToFd(ErrorPipeFd, ErrStr, ErrLen); } _exit(127); } int MkdirIfNeeded(const char* Path, mode_t Mode) { if (mkdir(Path, Mode) != 0 && errno != EEXIST) { return -1; } return 0; } int BindMountReadOnly(const char* Src, const char* Dst) { if (mount(Src, Dst, nullptr, MS_BIND | MS_REC, nullptr) != 0) { return -1; } // Remount read-only if (mount(nullptr, Dst, nullptr, MS_REMOUNT | MS_BIND | MS_RDONLY | MS_REC, nullptr) != 0) { return -1; } return 0; } // Set up namespace-based sandbox isolation in the child process. // This is called after fork(), before execve(). All operations must be // async-signal-safe. // // The sandbox layout after pivot_root: // / -> the sandbox directory (tmpfs-like, was SandboxPath) // /usr -> bind-mount of host /usr (read-only) // /lib -> bind-mount of host /lib (read-only) // /lib64 -> bind-mount of host /lib64 (read-only, optional) // /etc -> bind-mount of host /etc (read-only) // /worker -> bind-mount of worker directory (read-only) // /proc -> proc filesystem // /dev -> tmpfs with null, zero, urandom void SetupNamespaceSandbox(const char* SandboxPath, uid_t Uid, gid_t Gid, const char* WorkerPath, int ErrorPipeFd) { // 1. Unshare user, mount, and network namespaces if (unshare(CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWNET) != 0) { WriteErrorAndExit(ErrorPipeFd, "unshare() failed", errno); } // 2. Write UID/GID mappings // Must deny setgroups first (required by kernel for unprivileged user namespaces) { int Fd = open("/proc/self/setgroups", O_WRONLY); if (Fd >= 0) { WriteToFd(Fd, "deny", 4); close(Fd); } // setgroups file may not exist on older kernels; not fatal } { // uid_map: map our UID to 0 inside the namespace char Buf[64]; int Len = snprintf(Buf, sizeof(Buf), "0 %u 1\n", static_cast(Uid)); int Fd = open("/proc/self/uid_map", O_WRONLY); if (Fd < 0) { WriteErrorAndExit(ErrorPipeFd, "open uid_map failed", errno); } WriteToFd(Fd, Buf, static_cast(Len)); close(Fd); } { // gid_map: map our GID to 0 inside the namespace char Buf[64]; int Len = snprintf(Buf, sizeof(Buf), "0 %u 1\n", static_cast(Gid)); int Fd = open("/proc/self/gid_map", O_WRONLY); if (Fd < 0) { WriteErrorAndExit(ErrorPipeFd, "open gid_map failed", errno); } WriteToFd(Fd, Buf, static_cast(Len)); close(Fd); } // 3. Privatize the entire mount tree so our mounts don't propagate if (mount(nullptr, "/", nullptr, MS_REC | MS_PRIVATE, nullptr) != 0) { WriteErrorAndExit(ErrorPipeFd, "mount MS_PRIVATE failed", errno); } // 4. Create mount points inside the sandbox and bind-mount system directories // Helper macro-like pattern for building paths inside sandbox // We use stack buffers since we can't allocate heap memory safely char MountPoint[4096]; auto BuildPath = [&](const char* Suffix) -> const char* { snprintf(MountPoint, sizeof(MountPoint), "%s/%s", SandboxPath, Suffix); return MountPoint; }; // /usr (required) if (MkdirIfNeeded(BuildPath("usr"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/usr failed", errno); } if (BindMountReadOnly("/usr", BuildPath("usr")) != 0) { WriteErrorAndExit(ErrorPipeFd, "bind mount /usr failed", errno); } // /lib (required) if (MkdirIfNeeded(BuildPath("lib"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/lib failed", errno); } if (BindMountReadOnly("/lib", BuildPath("lib")) != 0) { WriteErrorAndExit(ErrorPipeFd, "bind mount /lib failed", errno); } // /lib64 (optional — not all distros have it) { struct stat St; if (stat("/lib64", &St) == 0 && S_ISDIR(St.st_mode)) { if (MkdirIfNeeded(BuildPath("lib64"), 0755) == 0) { BindMountReadOnly("/lib64", BuildPath("lib64")); // Failure is non-fatal for lib64 } } } // /etc (required — for resolv.conf, ld.so.cache, etc.) if (MkdirIfNeeded(BuildPath("etc"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/etc failed", errno); } if (BindMountReadOnly("/etc", BuildPath("etc")) != 0) { WriteErrorAndExit(ErrorPipeFd, "bind mount /etc failed", errno); } // /worker — bind-mount worker directory (contains the executable) if (MkdirIfNeeded(BuildPath("worker"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/worker failed", errno); } if (BindMountReadOnly(WorkerPath, BuildPath("worker")) != 0) { WriteErrorAndExit(ErrorPipeFd, "bind mount worker dir failed", errno); } // 5. Mount /proc inside sandbox if (MkdirIfNeeded(BuildPath("proc"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/proc failed", errno); } if (mount("proc", BuildPath("proc"), "proc", MS_NOSUID | MS_NOEXEC | MS_NODEV, nullptr) != 0) { WriteErrorAndExit(ErrorPipeFd, "mount /proc failed", errno); } // 6. Mount tmpfs /dev and bind-mount essential device nodes if (MkdirIfNeeded(BuildPath("dev"), 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir sandbox/dev failed", errno); } if (mount("tmpfs", BuildPath("dev"), "tmpfs", MS_NOSUID | MS_NOEXEC, "size=64k,mode=0755") != 0) { WriteErrorAndExit(ErrorPipeFd, "mount tmpfs /dev failed", errno); } // Bind-mount /dev/null, /dev/zero, /dev/urandom { char DevSrc[64]; char DevDst[4096]; auto BindDev = [&](const char* Name) { snprintf(DevSrc, sizeof(DevSrc), "/dev/%s", Name); snprintf(DevDst, sizeof(DevDst), "%s/dev/%s", SandboxPath, Name); // Create the file to mount over int Fd = open(DevDst, O_WRONLY | O_CREAT, 0666); if (Fd >= 0) { close(Fd); } mount(DevSrc, DevDst, nullptr, MS_BIND, nullptr); // Non-fatal if individual devices fail }; BindDev("null"); BindDev("zero"); BindDev("urandom"); } // 7. pivot_root to sandbox // pivot_root requires the new root and put_old to be mount points. // Bind-mount sandbox onto itself to make it a mount point. if (mount(SandboxPath, SandboxPath, nullptr, MS_BIND | MS_REC, nullptr) != 0) { WriteErrorAndExit(ErrorPipeFd, "bind mount sandbox onto itself failed", errno); } // Create .pivot_old inside sandbox char PivotOld[4096]; snprintf(PivotOld, sizeof(PivotOld), "%s/.pivot_old", SandboxPath); if (MkdirIfNeeded(PivotOld, 0755) != 0) { WriteErrorAndExit(ErrorPipeFd, "mkdir .pivot_old failed", errno); } if (syscall(SYS_pivot_root, SandboxPath, PivotOld) != 0) { WriteErrorAndExit(ErrorPipeFd, "pivot_root failed", errno); } // 8. Now inside new root. Clean up old root. if (chdir("/") != 0) { WriteErrorAndExit(ErrorPipeFd, "chdir / failed", errno); } if (umount2("/.pivot_old", MNT_DETACH) != 0) { WriteErrorAndExit(ErrorPipeFd, "umount2 .pivot_old failed", errno); } rmdir("/.pivot_old"); } } // anonymous namespace LinuxProcessRunner::LinuxProcessRunner(ChunkResolver& Resolver, const std::filesystem::path& BaseDir, DeferredDirectoryDeleter& Deleter, WorkerThreadPool& WorkerPool, bool Sandboxed) : LocalProcessRunner(Resolver, BaseDir, Deleter, WorkerPool) , m_Sandboxed(Sandboxed) { // Restore SIGCHLD to default behavior so waitpid() can properly collect // child exit status. zenserver/main.cpp sets SIGCHLD to SIG_IGN which // causes the kernel to auto-reap children, making waitpid() return // -1/ECHILD instead of the exit status we need. struct sigaction Action = {}; sigemptyset(&Action.sa_mask); Action.sa_handler = SIG_DFL; sigaction(SIGCHLD, &Action, nullptr); if (m_Sandboxed) { ZEN_INFO("namespace sandboxing enabled for child processes"); } } SubmitResult LinuxProcessRunner::SubmitAction(Ref Action) { ZEN_TRACE_CPU("LinuxProcessRunner::SubmitAction"); std::optional Prepared = PrepareActionSubmission(Action); if (!Prepared) { return SubmitResult{.IsAccepted = false}; } // Build environment array from worker descriptor CbObject WorkerDescription = Prepared->WorkerPackage.GetObject(); std::vector EnvStrings; for (auto& It : WorkerDescription["environment"sv]) { EnvStrings.emplace_back(It.AsString()); } std::vector Envp; Envp.reserve(EnvStrings.size() + 1); for (auto& Str : EnvStrings) { Envp.push_back(Str.data()); } Envp.push_back(nullptr); // Build argv: -Build=build.action // Pre-compute all path strings before fork() for async-signal-safety. std::string_view ExecPath = WorkerDescription["path"sv].AsString(); std::string ExePathStr; std::string SandboxedExePathStr; if (m_Sandboxed) { // After pivot_root, the worker dir is at /worker inside the new root std::filesystem::path SandboxedExePath = std::filesystem::path("/worker") / std::filesystem::path(ExecPath); SandboxedExePathStr = SandboxedExePath.string(); // We still need the real path for logging ExePathStr = (Prepared->WorkerPath / std::filesystem::path(ExecPath)).string(); } else { ExePathStr = (Prepared->WorkerPath / std::filesystem::path(ExecPath)).string(); } std::string BuildArg = "-Build=build.action"; // argv[0] should be the path the child will see const std::string& ChildExePath = m_Sandboxed ? SandboxedExePathStr : ExePathStr; std::vector ArgV; ArgV.push_back(const_cast(ChildExePath.data())); ArgV.push_back(BuildArg.data()); ArgV.push_back(nullptr); ZEN_DEBUG("Executing: {} {} (sandboxed={})", ExePathStr, BuildArg, m_Sandboxed); std::string SandboxPathStr = Prepared->SandboxPath.string(); std::string WorkerPathStr = Prepared->WorkerPath.string(); // Pre-fork: get uid/gid for namespace mapping, create error pipe uid_t CurrentUid = 0; gid_t CurrentGid = 0; int ErrorPipe[2] = {-1, -1}; if (m_Sandboxed) { CurrentUid = getuid(); CurrentGid = getgid(); if (pipe2(ErrorPipe, O_CLOEXEC) != 0) { throw zen::runtime_error("pipe2() for sandbox error pipe failed: {}", strerror(errno)); } } pid_t ChildPid = fork(); if (ChildPid < 0) { int SavedErrno = errno; if (m_Sandboxed) { close(ErrorPipe[0]); close(ErrorPipe[1]); } throw zen::runtime_error("fork() failed: {}", strerror(SavedErrno)); } if (ChildPid == 0) { // Child process if (m_Sandboxed) { // Close read end of error pipe — child only writes close(ErrorPipe[0]); SetupNamespaceSandbox(SandboxPathStr.c_str(), CurrentUid, CurrentGid, WorkerPathStr.c_str(), ErrorPipe[1]); // After pivot_root, CWD is "/" which is the sandbox root. // execve with the sandboxed path. execve(SandboxedExePathStr.c_str(), ArgV.data(), Envp.data()); WriteErrorAndExit(ErrorPipe[1], "execve failed", errno); } else { if (chdir(SandboxPathStr.c_str()) != 0) { _exit(127); } execve(ExePathStr.c_str(), ArgV.data(), Envp.data()); _exit(127); } } // Parent process if (m_Sandboxed) { // Close write end of error pipe — parent only reads close(ErrorPipe[1]); // Read from error pipe. If execve succeeded, pipe was closed by O_CLOEXEC // and read returns 0. If setup failed, child wrote an error message. char ErrBuf[512]; ssize_t BytesRead = read(ErrorPipe[0], ErrBuf, sizeof(ErrBuf) - 1); close(ErrorPipe[0]); if (BytesRead > 0) { // Sandbox setup or execve failed ErrBuf[BytesRead] = '\0'; // Reap the child (it called _exit(127)) waitpid(ChildPid, nullptr, 0); // Clean up the sandbox in the background m_DeferredDeleter.Enqueue(Action->ActionLsn, std::move(Prepared->SandboxPath)); ZEN_ERROR("Sandbox setup failed for action {}: {}", Action->ActionLsn, ErrBuf); Action->SetActionState(RunnerAction::State::Failed); return SubmitResult{.IsAccepted = false}; } } // Store child pid as void* (same convention as zencore/process.cpp) Ref NewAction{new RunningAction()}; NewAction->Action = Action; NewAction->ProcessHandle = reinterpret_cast(static_cast(ChildPid)); NewAction->SandboxPath = std::move(Prepared->SandboxPath); { RwLock::ExclusiveLockScope _(m_RunningLock); m_RunningMap[Prepared->ActionLsn] = std::move(NewAction); } Action->SetActionState(RunnerAction::State::Running); return SubmitResult{.IsAccepted = true}; } void LinuxProcessRunner::SweepRunningActions() { ZEN_TRACE_CPU("LinuxProcessRunner::SweepRunningActions"); std::vector> CompletedActions; m_RunningLock.WithExclusiveLock([&] { for (auto It = begin(m_RunningMap), ItEnd = end(m_RunningMap); It != ItEnd;) { Ref Running = It->second; pid_t Pid = static_cast(reinterpret_cast(Running->ProcessHandle)); int Status = 0; pid_t Result = waitpid(Pid, &Status, WNOHANG); if (Result == Pid) { if (WIFEXITED(Status)) { Running->ExitCode = WEXITSTATUS(Status); } else if (WIFSIGNALED(Status)) { Running->ExitCode = 128 + WTERMSIG(Status); } else { Running->ExitCode = 1; } Running->ProcessHandle = nullptr; CompletedActions.push_back(std::move(Running)); It = m_RunningMap.erase(It); } else { ++It; } } }); ProcessCompletedActions(CompletedActions); } void LinuxProcessRunner::CancelRunningActions() { ZEN_TRACE_CPU("LinuxProcessRunner::CancelRunningActions"); Stopwatch Timer; std::unordered_map> RunningMap; m_RunningLock.WithExclusiveLock([&] { std::swap(RunningMap, m_RunningMap); }); if (RunningMap.empty()) { return; } ZEN_INFO("cancelling all running actions"); // Send SIGTERM to all running processes first std::vector TerminatedLsnList; for (const auto& Kv : RunningMap) { Ref Running = Kv.second; pid_t Pid = static_cast(reinterpret_cast(Running->ProcessHandle)); if (kill(Pid, SIGTERM) == 0) { TerminatedLsnList.push_back(Kv.first); } else { ZEN_WARN("kill(SIGTERM) for LSN {} (pid {}) failed: {}", Running->Action->ActionLsn, Pid, strerror(errno)); } } // Wait up to 2 seconds for graceful exit, then SIGKILL if needed for (int Lsn : TerminatedLsnList) { if (auto It = RunningMap.find(Lsn); It != RunningMap.end()) { Ref Running = It->second; pid_t Pid = static_cast(reinterpret_cast(Running->ProcessHandle)); // Poll for up to 2 seconds bool Exited = false; for (int i = 0; i < 20; ++i) { int Status = 0; pid_t WaitResult = waitpid(Pid, &Status, WNOHANG); if (WaitResult == Pid) { Exited = true; ZEN_DEBUG("LSN {}: process exit OK", Running->Action->ActionLsn); break; } usleep(100000); // 100ms } if (!Exited) { ZEN_WARN("LSN {}: process did not exit after SIGTERM, sending SIGKILL", Running->Action->ActionLsn); kill(Pid, SIGKILL); waitpid(Pid, nullptr, 0); } m_DeferredDeleter.Enqueue(Running->Action->ActionLsn, std::move(Running->SandboxPath)); Running->Action->SetActionState(RunnerAction::State::Failed); } } ZEN_INFO("DONE - cancelled {} running processes (took {})", TerminatedLsnList.size(), NiceTimeSpanMs(Timer.GetElapsedTimeMs())); } } // namespace zen::compute #endif