From 5d41b3bdce463d43f12b365d8e19ca1cef4b34d2 Mon Sep 17 00:00:00 2001 From: Marcelo Lynch Date: Mon, 2 Feb 2026 08:14:32 -0800 Subject: [PATCH] fix: avoid deadlock by not throttling workers when the task manager is shutting down (#12052) This PR avoids a potential deadlock on shutdown of a Lean program when the number of pooled threads has temporarily been pushed above the limit. There's a potential race between the finalizer "waking up everyone" after setting `m_shutting_down = true` and a worker that is about to be throttled because of concurrency limits. - `m_max_std_workers = 1`, `m_std_workers.size() = 2`, and the queue still has tasks. - Finalizer sets `m_shutting_down = true` and calls `notify_all()` while a worker is running a task (outside of the mutex). - Worker finishes a task, re-enters the loop, sees work, and "should wait" because `active >= max`. - Worker then calls `wait()` after the notify and never wakes, so `join()` in the finalizer hangs. This PR avoids the worker being blocked by not `wait()`ing if we are already shutting down. The code is restructured a bit for readability, where the first section is "there's no work in the queue" and the next section is "there is some work in the queue" --- src/runtime/object.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/runtime/object.cpp b/src/runtime/object.cpp index 80c9735268..a3be15ffe9 100644 --- a/src/runtime/object.cpp +++ b/src/runtime/object.cpp @@ -754,14 +754,25 @@ class task_manager { unique_lock lock(m_mutex); m_idle_std_workers++; while (true) { - if (m_queues_size == 0 && m_shutting_down) { - break; + if (m_queues_size == 0) { + if (m_shutting_down) { + // We're done + break; + } + // Wait for new tasks + m_queue_cv.wait(lock); + continue; } - if (m_queues_size == 0 || - // If we have reached the maximum number of standard workers (because the - // maximum was decreased by `task_get`), wait for someone else to become - // idle before picking up new work. - m_std_workers.size() - m_idle_std_workers >= m_max_std_workers) { + + // There's work to be done. + // If we have reached the maximum number of standard workers (because the + // maximum was decreased by `task_get`), wait for someone else to become + // idle before picking up new work. + // But during shutdown, we skip this throttling: + // because the finalizer might have called m_queue_cv.notify_all() for the last + // time, we don't want to get stuck behind the wait(). + if (!m_shutting_down && + m_std_workers.size() - m_idle_std_workers >= m_max_std_workers) { m_queue_cv.wait(lock); continue; }