diff options
| author | Stefan Boberg <[email protected]> | 2026-04-23 18:16:57 +0200 |
|---|---|---|
| committer | Stefan Boberg <[email protected]> | 2026-04-23 18:16:57 +0200 |
| commit | 0232b991cd7d8e3a2114ea30e4591dd3e7b65c36 (patch) | |
| tree | 94730e7594fd09ae1fa820391ce311f6daf13905 /src/zencompute/computeservice.cpp | |
| parent | Fix forward declaration order for s_GotSigWinch and SigWinchHandler (diff) | |
| parent | trace: declare Region event name fields as AnsiString (#1012) (diff) | |
| download | archived-zen-sb/zen-help.tar.xz archived-zen-sb/zen-help.zip | |
Merge branch 'main' into sb/zen-helpsb/zen-help
- Combine HelpCommand (this branch) with HistoryCommand (main) in zen CLI dispatcher
- Keep filter-aware TuiPickOne rewrite; adopt main's ASCII arrow glyphs in doc comment
Diffstat (limited to 'src/zencompute/computeservice.cpp')
| -rw-r--r-- | src/zencompute/computeservice.cpp | 196 |
1 files changed, 135 insertions, 61 deletions
diff --git a/src/zencompute/computeservice.cpp b/src/zencompute/computeservice.cpp index 58761556a..7f354a51c 100644 --- a/src/zencompute/computeservice.cpp +++ b/src/zencompute/computeservice.cpp @@ -121,6 +121,8 @@ struct ComputeServiceSession::Impl , m_LocalSubmitPool(GetLargeWorkerPool(EWorkloadType::Burst)) , m_RemoteSubmitPool(GetLargeWorkerPool(EWorkloadType::Burst)) { + m_RemoteRunnerGroup.SetWorkerPool(&m_RemoteSubmitPool); + // Create a non-expiring, non-deletable implicit queue for legacy endpoints auto Result = CreateQueue("implicit"sv, {}, {}); m_ImplicitQueueId = Result.QueueId; @@ -240,8 +242,9 @@ struct ComputeServiceSession::Impl // Recording - void StartRecording(ChunkResolver& InCidStore, const std::filesystem::path& RecordingPath); - void StopRecording(); + bool StartRecording(ChunkResolver& InCidStore, const std::filesystem::path& RecordingPath); + bool StopRecording(); + bool IsRecording() const; std::unique_ptr<ActionRecorder> m_Recorder; @@ -446,7 +449,7 @@ ComputeServiceSession::Impl::RequestStateTransition(SessionState NewState) return true; } - // CAS failed, Current was updated — retry with the new value + // CAS failed, Current was updated - retry with the new value } } @@ -615,6 +618,7 @@ ComputeServiceSession::Impl::UpdateCoordinatorState() m_KnownWorkerUris.insert(UriStr); auto* NewRunner = new RemoteHttpRunner(m_ChunkResolver, m_OrchestratorBasePath, UriStr, m_RemoteSubmitPool); + NewRunner->SetRemoteHostname(Hostname); SyncWorkersToRunner(*NewRunner); m_RemoteRunnerGroup.AddRunner(NewRunner); } @@ -716,24 +720,44 @@ ComputeServiceSession::Impl::ShutdownRunners() m_RemoteRunnerGroup.Shutdown(); } -void +bool ComputeServiceSession::Impl::StartRecording(ChunkResolver& InCidStore, const std::filesystem::path& RecordingPath) { + if (m_Recorder) + { + ZEN_WARN("recording is already active"); + return false; + } + ZEN_INFO("starting recording to '{}'", RecordingPath); m_Recorder = std::make_unique<ActionRecorder>(InCidStore, RecordingPath); ZEN_INFO("started recording to '{}'", RecordingPath); + return true; } -void +bool ComputeServiceSession::Impl::StopRecording() { + if (!m_Recorder) + { + ZEN_WARN("no recording is active"); + return false; + } + ZEN_INFO("stopping recording"); m_Recorder = nullptr; ZEN_INFO("stopped recording"); + return true; +} + +bool +ComputeServiceSession::Impl::IsRecording() const +{ + return m_Recorder != nullptr; } std::vector<ComputeServiceSession::RunningActionInfo> @@ -1128,6 +1152,10 @@ ComputeServiceSession::Impl::GetCompleted(CbWriter& Cbo) Cbo.BeginObject(); Cbo << "lsn"sv << Lsn; Cbo << "state"sv << RunnerAction::ToString(Action->ActionState()); + if (!Action->FailureReason.empty()) + { + Cbo << "reason"sv << Action->FailureReason; + } Cbo.EndObject(); } }); @@ -1416,8 +1444,8 @@ ComputeServiceSession::Impl::GetQueueCompleted(int QueueId, CbWriter& Cbo) if (Queue) { - Queue->m_Lock.WithSharedLock([&] { - m_ActionMapLock.WithSharedLock([&] { + m_ActionMapLock.WithSharedLock([&] { + Queue->m_Lock.WithSharedLock([&] { for (int Lsn : Queue->FinishedLsns) { if (m_ResultsMap.contains(Lsn)) @@ -1447,15 +1475,19 @@ ComputeServiceSession::Impl::NotifyQueueActionComplete(int QueueId, int Lsn, Run return; } + bool WasActive = false; Queue->m_Lock.WithExclusiveLock([&] { - Queue->ActiveLsns.erase(Lsn); + WasActive = Queue->ActiveLsns.erase(Lsn) > 0; Queue->FinishedLsns.insert(Lsn); }); - const int PreviousActive = Queue->ActiveCount.fetch_sub(1, std::memory_order_relaxed); - if (PreviousActive == 1) + if (WasActive) { - Queue->IdleSince.store(GetHifreqTimerValue(), std::memory_order_relaxed); + const int PreviousActive = Queue->ActiveCount.fetch_sub(1, std::memory_order_relaxed); + if (PreviousActive == 1) + { + Queue->IdleSince.store(GetHifreqTimerValue(), std::memory_order_relaxed); + } } switch (ActionState) @@ -1526,12 +1558,12 @@ ComputeServiceSession::Impl::SchedulePendingActions() static Stopwatch DumpRunningTimer; auto _ = MakeGuard([&] { - ZEN_INFO("scheduled {} pending actions. {} running ({} retired), {} still pending, {} results", - ScheduledCount, - RunningCount, - m_RetiredCount.load(), - PendingCount, - ResultCount); + ZEN_DEBUG("scheduled {} pending actions. {} running ({} retired), {} still pending, {} results", + ScheduledCount, + RunningCount, + m_RetiredCount.load(), + PendingCount, + ResultCount); if (DumpRunningTimer.GetElapsedTimeMs() > 30000) { @@ -1580,13 +1612,13 @@ ComputeServiceSession::Impl::SchedulePendingActions() // Also note that the m_PendingActions list is not maintained // here, that's done periodically in SchedulePendingActions() - m_ActionMapLock.WithExclusiveLock([&] { - if (m_SessionState.load(std::memory_order_relaxed) >= SessionState::Paused) - { - return; - } + // Extract pending actions under a shared lock - we only need to read + // the map and take Ref copies. ActionState() is atomic so this is safe. + // Sorting and capacity trimming happen outside the lock to avoid + // blocking HTTP handlers on O(N log N) work with large pending queues. - if (m_PendingActions.empty()) + m_ActionMapLock.WithSharedLock([&] { + if (m_SessionState.load(std::memory_order_relaxed) >= SessionState::Paused) { return; } @@ -1606,6 +1638,7 @@ ComputeServiceSession::Impl::SchedulePendingActions() case RunnerAction::State::Completed: case RunnerAction::State::Failed: case RunnerAction::State::Abandoned: + case RunnerAction::State::Rejected: case RunnerAction::State::Cancelled: break; @@ -1616,30 +1649,30 @@ ComputeServiceSession::Impl::SchedulePendingActions() } } - // Sort by priority descending, then by LSN ascending (FIFO within same priority) - std::sort(ActionsToSchedule.begin(), ActionsToSchedule.end(), [](const Ref<RunnerAction>& A, const Ref<RunnerAction>& B) { - if (A->Priority != B->Priority) - { - return A->Priority > B->Priority; - } - return A->ActionLsn < B->ActionLsn; - }); + PendingCount = m_PendingActions.size(); + }); - if (ActionsToSchedule.size() > Capacity) + // Sort by priority descending, then by LSN ascending (FIFO within same priority) + std::sort(ActionsToSchedule.begin(), ActionsToSchedule.end(), [](const Ref<RunnerAction>& A, const Ref<RunnerAction>& B) { + if (A->Priority != B->Priority) { - ActionsToSchedule.resize(Capacity); + return A->Priority > B->Priority; } - - PendingCount = m_PendingActions.size(); + return A->ActionLsn < B->ActionLsn; }); + if (ActionsToSchedule.size() > Capacity) + { + ActionsToSchedule.resize(Capacity); + } + if (ActionsToSchedule.empty()) { _.Dismiss(); return; } - ZEN_INFO("attempting schedule of {} pending actions", ActionsToSchedule.size()); + ZEN_DEBUG("attempting schedule of {} pending actions", ActionsToSchedule.size()); Stopwatch SubmitTimer; std::vector<SubmitResult> SubmitResults = SubmitActions(ActionsToSchedule); @@ -1659,10 +1692,10 @@ ComputeServiceSession::Impl::SchedulePendingActions() } } - ZEN_INFO("scheduled {} pending actions in {} ({} rejected)", - ScheduledActionCount, - NiceTimeSpanMs(SubmitTimer.GetElapsedTimeMs()), - NotAcceptedCount); + ZEN_DEBUG("scheduled {} pending actions in {} ({} rejected)", + ScheduledActionCount, + NiceTimeSpanMs(SubmitTimer.GetElapsedTimeMs()), + NotAcceptedCount); ScheduledCount += ScheduledActionCount; PendingCount -= ScheduledActionCount; @@ -1708,7 +1741,7 @@ ComputeServiceSession::Impl::SchedulerThreadFunction() HandleActionUpdates(); - // Auto-transition Draining → Paused when all work is done + // Auto-transition Draining -> Paused when all work is done if (m_SessionState.load(std::memory_order_relaxed) == SessionState::Draining) { bool AllDrained = m_ActionMapLock.WithSharedLock([&] { return m_PendingActions.empty() && m_RunningMap.empty(); }); @@ -1833,7 +1866,7 @@ ComputeServiceSession::Impl::RescheduleAction(int ActionLsn) } } - // Reset action state — this calls PostUpdate() internally + // Reset action state - this calls PostUpdate() internally Action->ResetActionStateToPending(); int NewRetryCount = Action->RetryCount.load(std::memory_order_relaxed); @@ -1915,7 +1948,7 @@ ComputeServiceSession::Impl::HandleActionUpdates() // This is safe because state transitions are monotonically increasing by enum // rank (Pending < Submitting < Running < Completed/Failed/Cancelled), so // SetActionState rejects any transition to a lower-ranked state. By the time - // we read ActionState() here, it reflects the highest state reached — making + // we read ActionState() here, it reflects the highest state reached - making // the first occurrence per LSN authoritative and duplicates redundant. for (Ref<RunnerAction>& Action : UpdatedActions) { @@ -1925,7 +1958,7 @@ ComputeServiceSession::Impl::HandleActionUpdates() { switch (Action->ActionState()) { - // Newly enqueued — add to pending map for scheduling + // Newly enqueued - add to pending map for scheduling case RunnerAction::State::Pending: // Guard against a race where the session is abandoned between // EnqueueAction (which calls PostUpdate) and this scheduler @@ -1946,11 +1979,11 @@ ComputeServiceSession::Impl::HandleActionUpdates() } break; - // Async submission in progress — remains in pending map + // Async submission in progress - remains in pending map case RunnerAction::State::Submitting: break; - // Dispatched to a runner — move from pending to running + // Dispatched to a runner - move from pending to running case RunnerAction::State::Running: m_ActionMapLock.WithExclusiveLock([&] { m_RunningMap.insert({ActionLsn, Action}); @@ -1959,7 +1992,7 @@ ComputeServiceSession::Impl::HandleActionUpdates() ZEN_DEBUG("action {} ({}) RUNNING", Action->ActionId, ActionLsn); break; - // Retracted — pull back for rescheduling without counting against retry limit + // Retracted - pull back for rescheduling without counting against retry limit case RunnerAction::State::Retracted: { m_ActionMapLock.WithExclusiveLock([&] { @@ -1971,7 +2004,15 @@ ComputeServiceSession::Impl::HandleActionUpdates() break; } - // Terminal states — move to results, record history, notify queue + // Rejected - runner was at capacity, reschedule without retry cost + case RunnerAction::State::Rejected: + { + Action->ResetActionStateToPending(); + ZEN_DEBUG("action {} ({}) rescheduled after runner rejection", Action->ActionId, ActionLsn); + break; + } + + // Terminal states - move to results, record history, notify queue case RunnerAction::State::Completed: case RunnerAction::State::Failed: case RunnerAction::State::Abandoned: @@ -1980,7 +2021,7 @@ ComputeServiceSession::Impl::HandleActionUpdates() auto TerminalState = Action->ActionState(); // Automatic retry for Failed/Abandoned actions with retries remaining. - // Skip retries when the session itself is abandoned — those actions + // Skip retries when the session itself is abandoned - those actions // were intentionally abandoned and should not be rescheduled. if ((TerminalState == RunnerAction::State::Failed || TerminalState == RunnerAction::State::Abandoned) && m_SessionState.load(std::memory_order_relaxed) < SessionState::Abandoned) @@ -2005,6 +2046,14 @@ ComputeServiceSession::Impl::HandleActionUpdates() MaxRetries); break; } + else + { + ZEN_WARN("action {} ({}) {} after {} retries, not rescheduling", + Action->ActionId, + ActionLsn, + RunnerAction::ToString(TerminalState), + Action->RetryCount.load(std::memory_order_relaxed)); + } } m_ActionMapLock.WithExclusiveLock([&] { @@ -2097,10 +2146,9 @@ ComputeServiceSession::Impl::SubmitActions(const std::vector<Ref<RunnerAction>>& ZEN_TRACE_CPU("ComputeServiceSession::SubmitActions"); std::vector<SubmitResult> Results(Actions.size()); - // First try submitting the batch to local runners in parallel + // First try submitting the batch to local runners std::vector<SubmitResult> LocalResults = m_LocalRunnerGroup.SubmitActions(Actions); - std::vector<size_t> RemoteIndices; std::vector<Ref<RunnerAction>> RemoteActions; for (size_t i = 0; i < Actions.size(); ++i) @@ -2111,20 +2159,40 @@ ComputeServiceSession::Impl::SubmitActions(const std::vector<Ref<RunnerAction>>& } else { - RemoteIndices.push_back(i); RemoteActions.push_back(Actions[i]); + Results[i] = SubmitResult{.IsAccepted = true, .Reason = "dispatched to remote"}; } } - // Submit remaining actions to remote runners in parallel + // Dispatch remaining actions to remote runners asynchronously. + // Mark actions as Submitting so the scheduler won't re-pick them. + // The remote runner will transition them to Running on success, or + // we mark them Failed on rejection so HandleActionUpdates retries. if (!RemoteActions.empty()) { - std::vector<SubmitResult> RemoteResults = m_RemoteRunnerGroup.SubmitActions(RemoteActions); - - for (size_t j = 0; j < RemoteIndices.size(); ++j) + for (const Ref<RunnerAction>& Action : RemoteActions) { - Results[RemoteIndices[j]] = std::move(RemoteResults[j]); + Action->SetActionState(RunnerAction::State::Submitting); } + + m_RemoteSubmitPool.ScheduleWork( + [this, RemoteActions = std::move(RemoteActions)]() { + std::vector<SubmitResult> RemoteResults = m_RemoteRunnerGroup.SubmitActions(RemoteActions); + + for (size_t j = 0; j < RemoteResults.size(); ++j) + { + if (!RemoteResults[j].IsAccepted) + { + ZEN_DEBUG("remote submission rejected for action {} ({}): {}", + RemoteActions[j]->ActionId, + RemoteActions[j]->ActionLsn, + RemoteResults[j].Reason); + + RemoteActions[j]->SetActionState(RunnerAction::State::Rejected); + } + } + }, + WorkerThreadPool::EMode::EnableBacklog); } return Results; @@ -2190,16 +2258,22 @@ ComputeServiceSession::NotifyOrchestratorChanged() m_Impl->NotifyOrchestratorChanged(); } -void +bool ComputeServiceSession::StartRecording(ChunkResolver& InResolver, const std::filesystem::path& RecordingPath) { - m_Impl->StartRecording(InResolver, RecordingPath); + return m_Impl->StartRecording(InResolver, RecordingPath); } -void +bool ComputeServiceSession::StopRecording() { - m_Impl->StopRecording(); + return m_Impl->StopRecording(); +} + +bool +ComputeServiceSession::IsRecording() const +{ + return m_Impl->IsRecording(); } ComputeServiceSession::ActionCounts |