From c985f285e4911affa46ea79e21263cbaf4c8322f Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Tue, 17 Feb 2026 09:08:17 +0100 Subject: [PATCH] [OMPIRBuilder] Hoist alloca's to entry blocks of compiler-emitted GPU reduction functions (#181359) Fixes a bug in GPU reductions when they are compiled with `-O0`. There were invalid memory accesses at runtime for the following example: ```fortran program test_array_reduction() integer :: red_array(1) integer :: i red_array = 0 !$omp target teams distribute parallel do reduction(+:red_array) do i = 1, 100 red_array(1) = red_array(1) + 4422 end do !$omp end target teams distribute parallel do print *, red_array end program test_array_reduction ``` The issue was caused by alloca's for some temp values in the combiner region of the reduction op being inlined beyond the entry blocks of the GPU reduction functions emitted by the compiler. This PR fixes the issue by hoisting all alloca's to the entry block after the reduction functions are completely emitted by the compiler. 
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 22 ++++++++++++++----- .../LLVMIR/allocatable_gpu_reduction.mlir | 17 ++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 464ec5b5a2ec..1947323ef85f 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -787,6 +787,13 @@ static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block) { AllocaInst->moveBefore(InsertPoint); } +static void hoistNonEntryAllocasToEntryBlock(llvm::Function *Func) { + PostDominatorTree PostDomTree(*Func); + for (llvm::BasicBlock &BB : *Func) + if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock())) + hoistNonEntryAllocasToEntryBlock(BB); +} + void OpenMPIRBuilder::finalize(Function *Fn) { SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; @@ -893,12 +900,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) { if (OI.PostOutlineCB) OI.PostOutlineCB(*OutlinedFn); - if (OI.FixUpNonEntryAllocas) { - PostDominatorTree PostDomTree(*OutlinedFn); - for (llvm::BasicBlock &BB : *OutlinedFn) - if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock())) - hoistNonEntryAllocasToEntryBlock(BB); - } + if (OI.FixUpNonEntryAllocas) + hoistNonEntryAllocasToEntryBlock(OutlinedFn); } // Remove work items that have been completed. @@ -4234,6 +4237,13 @@ Expected OpenMPIRBuilder::createReductionFunction( } Builder.CreateRetVoid(); + // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted + // to the entry block (this is done for higher opt levels by later passes in + // the pipeline). This has caused issues because non-entry `alloca`s force the + // function to use dynamic stack allocations and we might run out of scratch + // memory. 
+ hoistNonEntryAllocasToEntryBlock(ReductionFunc); + return ReductionFunc; } diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir index 95d12f304aca..c9ff6de8cc95 100644 --- a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir +++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir @@ -24,6 +24,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () %7 = llvm.mlir.constant(24 : i32) : i32 "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + llvm.br ^bb1 + ^bb1: %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> @@ -63,6 +65,21 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : } } +// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %[[ARG_0:.*]], ptr noundef %[[ARG_1:.*]]) {{.*}} { +// CHECK: entry: +// CHECK: %[[TEMP_1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } +// CHECK: %[[TEMP_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } +// CHECK: br label %[[RED_BODY:omp.reduction.nonatomic.body]] +// Verify that no allocas are emitted beyond the entry block. 
+// CHECK-NOT: alloca + +// CHECK: [[RED_BODY]]: +// CHECK: %[[TEMP_1_ACAST:.*]] = addrspacecast ptr addrspace(5) %[[TEMP_1]] to ptr +// CHECK: %[[TEMP_2_ACAST:.*]] = addrspacecast ptr addrspace(5) %[[TEMP_2]] to ptr +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TEMP_2_ACAST]], ptr %{{.*}}, i32 24, i1 false) +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TEMP_1_ACAST]], ptr %{{.*}}, i32 24, i1 false) +// CHECK: } + // CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} { // CHECK: %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5) // CHECK: %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5)