From 8daaa26efdda3802f73367d844b267bda3f84cbe Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 1 Apr 2026 19:20:16 -0700 Subject: [PATCH] [Support] Support nested parallel TaskGroup via work-stealing (#189293) Nested TaskGroups currently run serially to prevent deadlock, as documented by https://reviews.llvm.org/D61115 and refined by https://reviews.llvm.org/D148984 to use threadIndex. Enable nested parallelism by having worker threads actively execute tasks from the work queue while waiting (work-stealing), instead of just blocking. If the queue runs empty first, the waiting thread falls back to the blocking Latch::sync() to wait for tasks still in flight on other threads. Root-level TaskGroups (main thread) keep the efficient blocking Latch::sync(), so there is no overhead for the common non-nested case. In lld, https://reviews.llvm.org/D131247 worked around the limitation by passing a single root TaskGroup into OutputSection::writeTo and spawning 4MB-chunked tasks into it. However, SyntheticSection::writeTo calls with internal parallelism (e.g. GdbIndexSection, MergeNoTailSection) still ran serially on worker threads. With this change, their internal parallelFor/parallelForEach calls parallelize automatically via helpSync work-stealing. The increased parallelism can reorder error messages from parallel phases (e.g. relocation processing during section writes), so one lld test is updated to use --threads=1 for deterministic output. --- lld/test/ELF/merge-piece-oob.s | 4 +- llvm/include/llvm/Support/Parallel.h | 6 +- llvm/lib/Support/Parallel.cpp | 34 ++++++++--- llvm/unittests/Support/ParallelTest.cpp | 75 ++++++------------------- 4 files changed, 47 insertions(+), 72 deletions(-) diff --git a/lld/test/ELF/merge-piece-oob.s b/lld/test/ELF/merge-piece-oob.s index d2bf9fab443a..829f45cbc75e 100644 --- a/lld/test/ELF/merge-piece-oob.s +++ b/lld/test/ELF/merge-piece-oob.s @@ -3,8 +3,8 @@ ## Non-section symbols and offset <= section_size are accepted, matching GNU ld. 
# RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64 -# RUN: not ld.lld %t.o -o /dev/null -shared 2>&1 | FileCheck %s -DPREFIX=error --implicit-check-not=error: -# RUN: ld.lld %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s -DPREFIX=warning --implicit-check-not=warning: +# RUN: not ld.lld --threads=1 %t.o -o /dev/null -shared 2>&1 | FileCheck %s -DPREFIX=error --implicit-check-not=error: +# RUN: ld.lld --threads=1 %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s -DPREFIX=warning --implicit-check-not=warning: ## .foo is 8 bytes with entsize=8 (1 piece). .foo+8 (offset==size) is accepted. # CHECK: [[PREFIX]]: {{.*}}:(.foo): offset 0x9 is outside the section diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h index a7e4013f0fc5..4c69b950cb08 100644 --- a/llvm/include/llvm/Support/Parallel.h +++ b/llvm/include/llvm/Support/Parallel.h @@ -78,6 +78,8 @@ public: Cond.notify_all(); } + uint32_t getCount() const { return Count.load(std::memory_order_acquire); } + void sync() const { std::unique_lock lock(Mutex); Cond.wait(lock, [&] { return Count.load(std::memory_order_relaxed) == 0; }); @@ -94,12 +96,8 @@ public: LLVM_ABI ~TaskGroup(); // Spawn a task, but does not wait for it to finish. - // Tasks marked with \p Sequential will be executed - // exactly in the order which they were spawned. LLVM_ABI void spawn(std::function f); - void sync() const { L.sync(); } - bool isParallel() const { return Parallel; } }; diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp index 4d793e0de892..a7ca21e89f61 100644 --- a/llvm/lib/Support/Parallel.cpp +++ b/llvm/lib/Support/Parallel.cpp @@ -113,6 +113,18 @@ public: Cond.notify_one(); } + // Execute tasks from the work queue until the latch reaches zero. + // Used by nested TaskGroups (on worker threads) to prevent deadlock: + // instead of blocking in sync(), actively help drain the queue. 
+ void helpSync(const parallel::detail::Latch &L) { + while (L.getCount() != 0) { + std::unique_lock Lock(Mutex); + if (Stop || WorkStack.empty()) + return; + popAndRun(Lock); + } + } + size_t getThreadCount() const { return ThreadCount; } private: @@ -215,22 +227,30 @@ size_t parallel::getThreadCount() { } #endif -// Latch::sync() called by the dtor may cause one thread to block. If is a dead -// lock if all threads in the default executor are blocked. To prevent the dead -// lock, only allow the root TaskGroup to run tasks parallelly. In the scenario -// of nested parallel_for_each(), only the outermost one runs parallelly. +static bool isNested() { +#if LLVM_ENABLE_THREADS + return threadIndex != UINT_MAX; +#else + return false; +#endif +} + TaskGroup::TaskGroup() : Parallel( #if LLVM_ENABLE_THREADS - strategy.ThreadsRequested != 1 && threadIndex == UINT_MAX + strategy.ThreadsRequested != 1 #else false #endif ) { } + TaskGroup::~TaskGroup() { - // We must ensure that all the workloads have finished before decrementing the - // instances count. +#if LLVM_ENABLE_THREADS + // In a nested TaskGroup (threadIndex != -1u), actively help drain the queue. + if (Parallel && isNested()) + getDefaultExecutor()->helpSync(L); +#endif L.sync(); } diff --git a/llvm/unittests/Support/ParallelTest.cpp b/llvm/unittests/Support/ParallelTest.cpp index c7ecc4eff6c2..ad833419cf1b 100644 --- a/llvm/unittests/Support/ParallelTest.cpp +++ b/llvm/unittests/Support/ParallelTest.cpp @@ -95,73 +95,30 @@ TEST(Parallel, ForEachError) { #if LLVM_ENABLE_THREADS TEST(Parallel, NestedTaskGroup) { - // This test checks: - // 1. Root TaskGroup is in Parallel mode. - // 2. Nested TaskGroup is not in Parallel mode. 
parallel::TaskGroup tg; - - tg.spawn([&]() { - EXPECT_TRUE(tg.isParallel() || (parallel::strategy.ThreadsRequested == 1)); - }); + EXPECT_TRUE(tg.isParallel() || (parallel::strategy.ThreadsRequested == 1)); tg.spawn([&]() { parallel::TaskGroup nestedTG; - EXPECT_FALSE(nestedTG.isParallel()); - - nestedTG.spawn([&]() { - // Check that root TaskGroup is in Parallel mode. - EXPECT_TRUE(tg.isParallel() || - (parallel::strategy.ThreadsRequested == 1)); - - // Check that nested TaskGroup is not in Parallel mode. - EXPECT_FALSE(nestedTG.isParallel()); - }); + EXPECT_TRUE(nestedTG.isParallel() || + (parallel::strategy.ThreadsRequested == 1)); }); } -TEST(Parallel, ParallelNestedTaskGroup) { - // This test checks that it is possible to have several TaskGroups - // run from different threads in Parallel mode. - std::atomic Count{0}; - - { - std::function Fn = [&]() { - parallel::TaskGroup tg; - - tg.spawn([&]() { - // Check that root TaskGroup is in Parallel mode. - EXPECT_TRUE(tg.isParallel() || - (parallel::strategy.ThreadsRequested == 1)); - - // Check that nested TaskGroup is not in Parallel mode. - parallel::TaskGroup nestedTG; - EXPECT_FALSE(nestedTG.isParallel()); - ++Count; - - nestedTG.spawn([&]() { - // Check that root TaskGroup is in Parallel mode. - EXPECT_TRUE(tg.isParallel() || - (parallel::strategy.ThreadsRequested == 1)); - - // Check that nested TaskGroup is not in Parallel mode. - EXPECT_FALSE(nestedTG.isParallel()); - ++Count; - }); +// Verify nested parallelFor doesn't deadlock. This is a simplified version of +// the pattern from https://reviews.llvm.org/D61115 that originally motivated +// serializing nested TaskGroups. With work-stealing in helpSync(), nested +// parallelism now works without deadlock. 
+TEST(Parallel, NestedParallelFor) { + std::atomic count{0}; + parallelFor(0, 8, [&](size_t i) { + parallelFor(0, 8, [&](size_t j) { + parallelFor(0, 8, [&](size_t k) { + count.fetch_add(1, std::memory_order_relaxed); }); - }; - - DefaultThreadPool Pool; - - Pool.async(Fn); - Pool.async(Fn); - Pool.async(Fn); - Pool.async(Fn); - Pool.async(Fn); - Pool.async(Fn); - - Pool.wait(); - } - EXPECT_EQ(Count, 12ul); + }); + }); + EXPECT_EQ(count.load(), 512u); } #endif