diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 464ec5b5a2ec..1947323ef85f 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -787,6 +787,13 @@ static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block) { AllocaInst->moveBefore(InsertPoint); } +static void hoistNonEntryAllocasToEntryBlock(llvm::Function *Func) { + PostDominatorTree PostDomTree(*Func); + for (llvm::BasicBlock &BB : *Func) + if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock())) + hoistNonEntryAllocasToEntryBlock(BB); +} + void OpenMPIRBuilder::finalize(Function *Fn) { SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; @@ -893,12 +900,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) { if (OI.PostOutlineCB) OI.PostOutlineCB(*OutlinedFn); - if (OI.FixUpNonEntryAllocas) { - PostDominatorTree PostDomTree(*OutlinedFn); - for (llvm::BasicBlock &BB : *OutlinedFn) - if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock())) - hoistNonEntryAllocasToEntryBlock(BB); - } + if (OI.FixUpNonEntryAllocas) + hoistNonEntryAllocasToEntryBlock(OutlinedFn); } // Remove work items that have been completed. @@ -4234,6 +4237,13 @@ Expected OpenMPIRBuilder::createReductionFunction( } Builder.CreateRetVoid(); + // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted + // to the entry block (this is done for higher opt levels by later passes in + // the pipeline). This has caused issues because non-entry `alloca`s force the + // function to use dynamic stack allocations and we might run out of scratch + // memory. 
+ hoistNonEntryAllocasToEntryBlock(ReductionFunc); + return ReductionFunc; } diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir index 95d12f304aca..c9ff6de8cc95 100644 --- a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir +++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir @@ -24,6 +24,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () %7 = llvm.mlir.constant(24 : i32) : i32 "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + llvm.br ^bb1 + ^bb1: %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> @@ -63,6 +65,21 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : } } +// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %[[ARG_0:.*]], ptr noundef %[[ARG_1:.*]]) {{.*}} { +// CHECK: entry: +// CHECK: %[[TEMP_1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } +// CHECK: %[[TEMP_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } +// CHECK: br label %[[RED_BODY:omp.reduction.nonatomic.body]] +// Verify that no allocas are emitted beyond the entry block. 
+// CHECK-NOT: alloca + +// CHECK: [[RED_BODY]]: +// CHECK: %[[TEMP_1_ACAST:.*]] = addrspacecast ptr addrspace(5) %[[TEMP_1]] to ptr +// CHECK: %[[TEMP_2_ACAST:.*]] = addrspacecast ptr addrspace(5) %[[TEMP_2]] to ptr +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TEMP_2_ACAST]], ptr %{{.*}}, i32 24, i1 false) +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TEMP_1_ACAST]], ptr %{{.*}}, i32 24, i1 false) +// CHECK: } + // CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} { // CHECK: %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5) // CHECK: %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5)