diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp index ee42801da09d..a41d8d882650 100644 --- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp +++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp @@ -236,19 +236,25 @@ static bool isSafeToParallelize(Operation *op) { // Thread-local variables allocated in the OpenMP parallel region or coming // from privatizing clauses are private to each thread and thus safe (and - // sometimes required) to parallelize. If the compiler wraps such load/stores - // in an omp.single block, only one thread updates its local copy, while - // all other threads read uninitialized data (see issue #143330). - // Use MemoryEffectOpInterface to check all memory effects generically. - // This also handles hlfir.assign which is present when the pass runs before - // HLFIR lowering. + // sometimes required) to parallelize. If the compiler wraps stores to + // thread-local variables in an omp.single block, only one thread updates + // its local copy, while all other threads read uninitialized data (see + // issue #143330). + // + // Only WRITE effects to thread-local memory are considered safe here, not + // reads. If reads were also safe, the cascading effect in moveToSingle + // could cause entire SingleRegions to become fully parallelized (all ops + // safe), eliminating the omp.single and its implicit barrier. This removes + // synchronization points needed to keep threads coordinated inside + // sequential loops that contain workshared operations. if (auto memEffects = dyn_cast(op)) { SmallVector effects; memEffects.getEffects(effects); if (!effects.empty() && llvm::all_of(effects, [&](const MemoryEffects::EffectInstance &effect) { Value val = effect.getValue(); - return val && isOpenMPThreadLocalMemory(op, val); + return val && isa(effect.getEffect()) && + isOpenMPThreadLocalMemory(op, val); })) return true; } diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir index d046ef4cc46f..d6000c989515 100644 --- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir +++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir @@ -119,6 +119,39 @@ func.func @hlfir_assign_private_clause(%arg0: !fir.ref) { // CHECK-NEXT: } +// ----- + +// Check that hlfir.assign from a shared variable to a private variable is +// NOT parallelized. The assign has a Read effect on the shared RHS, which +// is not a write to thread-local memory, so isSafeToParallelize returns false. + +omp.private {type = private} @y_private : i32 + +// CHECK-LABEL: func.func @hlfir_assign_shared_to_private +func.func @hlfir_assign_shared_to_private(%arg0: !fir.ref, %shared: !fir.ref) { + omp.parallel private(@y_private %arg0 -> %priv_arg : !fir.ref) { + %decl:2 = hlfir.declare %priv_arg {uniq_name = "x"} : (!fir.ref) -> (!fir.ref, !fir.ref) + omp.workshare { + // hlfir.assign with a shared RHS variable should stay in omp.single + hlfir.assign %shared to %decl#0 : !fir.ref, !fir.ref + omp.terminator + } + omp.terminator + } + return +} + +// CHECK: omp.parallel private(@y_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref) { +// CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[PRIV_ARG]] +// CHECK-NEXT: omp.single nowait { +// CHECK: hlfir.assign %{{.*}} to %[[DECL]]#0 : !fir.ref, !fir.ref +// CHECK: omp.terminator +// CHECK-NEXT: } +// CHECK-NEXT: omp.barrier +// CHECK-NEXT: omp.terminator +// CHECK-NEXT: } + + // Check that reduction clause block arguments are recognized as thread-local. omp.declare_reduction @add_reduction_i32 : i32 init { @@ -263,7 +296,9 @@ func.func @non_thread_local_needs_single(%arg0: !fir.ref) { // CHECK-NEXT: } -// Check that loads from thread-local alloca combined with stores are parallelized. +// Check that stores to thread-local alloca are parallelized, but loads stay in +// omp.single. Only writes to thread-local memory are safe to parallelize; +// reads must remain in the single to preserve synchronization barriers. // CHECK-LABEL: func.func @thread_local_load_and_store func.func @thread_local_load_and_store() { @@ -281,14 +316,90 @@ func.func @thread_local_load_and_store() { return } -// All operations on thread-local memory should be outside omp.single +// The store to thread-local memory is parallelized (outside the single), +// but the load remains inside the single to maintain synchronization. // CHECK: omp.parallel { // CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32 -// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32 -// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref -// CHECK-NEXT: %[[VAL:.*]] = fir.load %[[ALLOCA]] : !fir.ref -// CHECK-NEXT: fir.store %[[VAL]] to %[[ALLOCA]] : !fir.ref +// CHECK: omp.single nowait { +// CHECK: fir.store {{.*}} to %[[ALLOCA]] : !fir.ref +// CHECK: fir.load %[[ALLOCA]] : !fir.ref +// CHECK: fir.store {{.*}} to %[[ALLOCA]] : !fir.ref +// CHECK: omp.terminator +// CHECK-NEXT: } +// CHECK: fir.store {{.*}} to %[[ALLOCA]] : !fir.ref // CHECK-NEXT: omp.barrier // CHECK-NEXT: omp.terminator // CHECK-NEXT: } + + +// Check the forall-in-workshare pattern: a sequential loop (fir.do_loop) with +// thread-local index stores, shared memory loads, and a workshare.loop_wrapper. +// This models the lowered IR for: +// !$omp workshare +// forall (i=1:n) a(:,i) = a(:,i) + 1 +// !$omp end workshare +// +// Loads from thread-local allocas must remain inside omp.single so that the +// single's barrier preserves thread synchronization. +// If reads were also safe-to-parallelize, the entire SingleRegion before the +// workshare.loop_wrapper could become fully parallelized, eliminating the +// omp.single and its implicit barrier. This caused race conditions on shared +// runtime data structures in forall-workshare patterns (see issue #143330). + +// CHECK-LABEL: func.func @forall_pattern_in_workshare +func.func @forall_pattern_in_workshare(%shared: !fir.ref) { + omp.parallel { + %idx_alloca = fir.alloca i32 {bindc_name = "i", pinned} + omp.workshare { + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + fir.do_loop %iv = %c1 to %c10 step %c1 { + // Store loop index to thread-local alloca (must be parallelized) + %iv_i32 = fir.convert %iv : (index) -> i32 + fir.store %iv_i32 to %idx_alloca : !fir.ref + // Load from shared memory (must stay in omp.single) + %shared_val = fir.load %shared : !fir.ref + // Load from thread-local alloca (must stay in omp.single to + // prevent SingleRegion elimination and barrier loss) + %idx_val = fir.load %idx_alloca : !fir.ref + // Side-effecting op using both values (must stay in omp.single) + "test.side_effect"(%shared_val, %idx_val) : (i32, i32) -> () + // Workshared loop + omp.workshare.loop_wrapper { + omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) { + "test.inner"(%j) : (index) -> () + omp.yield + } + } + } + omp.terminator + } + omp.terminator + } + return +} + +// The thread-local load must remain inside omp.single along with the shared +// load and side-effecting op (preserving the single's barrier). The +// thread-local store is also cloned inside the single, but a parallel copy +// is placed after it so all threads update their own alloca. +// CHECK: omp.parallel { +// CHECK: %[[IDX:.*]] = fir.alloca i32 {bindc_name = "i", pinned} +// CHECK: fir.do_loop +// The single contains the shared load, thread-local load, and side effect. +// The thread-local store is also cloned inside (harmless, one thread runs it). +// CHECK: omp.single { +// CHECK: fir.store {{.*}} to %[[IDX]] : !fir.ref +// CHECK: fir.load %{{.*}} : !fir.ref +// CHECK: fir.load %[[IDX]] : !fir.ref +// CHECK: "test.side_effect" +// CHECK: omp.terminator +// CHECK-NEXT: } +// The parallelized copy of the store runs after the single's barrier, +// ensuring all threads update their own thread-local index alloca. +// CHECK: fir.store {{.*}} to %[[IDX]] : !fir.ref +// CHECK: omp.wsloop { +// CHECK: } +// CHECK: omp.barrier +// CHECK: }