[OpenMP][MLIR] Support LLVM translation for distribute with delayed privatization (#131564)

Adds support for translating delayed privatization (`private` and
`firstprivate`) for `omp.distribute` ops.
This commit is contained in:
Kareem Ergawy 2025-03-18 10:14:42 +01:00 committed by GitHub
parent 44e4b27aec
commit 49b8d8472f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 166 additions and 38 deletions

View File

@ -250,7 +250,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkAllocate(op, result);
checkDistSchedule(op, result);
checkOrder(op, result);
checkPrivate(op, result);
})
.Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
.Case([&](omp::SectionsOp op) {
@ -4188,6 +4187,38 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
// DistributeOp has only one region associated with it.
builder.restoreIP(codeGenIP);
// TODO This is a recurring pattern in almost all ops that need
// privatization. Try to abstract it in a shared util/interface.
MutableArrayRef<BlockArgument> privateBlockArgs =
cast<omp::BlockArgOpenMPOpInterface>(*distributeOp)
.getPrivateBlockArgs();
SmallVector<mlir::Value> mlirPrivateVars;
SmallVector<llvm::Value *> llvmPrivateVars;
SmallVector<omp::PrivateClauseOp> privateDecls;
mlirPrivateVars.reserve(privateBlockArgs.size());
llvmPrivateVars.reserve(privateBlockArgs.size());
collectPrivatizationDecls(distributeOp, privateDecls);
for (mlir::Value privateVar : distributeOp.getPrivateVars())
mlirPrivateVars.push_back(privateVar);
llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
builder, moduleTranslation, privateBlockArgs, privateDecls,
mlirPrivateVars, llvmPrivateVars, allocaIP);
if (handleError(afterAllocas, opInst).failed())
return llvm::make_error<PreviouslyReportedError>();
if (handleError(initPrivateVars(builder, moduleTranslation,
privateBlockArgs, privateDecls,
mlirPrivateVars, llvmPrivateVars),
opInst)
.failed())
return llvm::make_error<PreviouslyReportedError>();
if (failed(copyFirstPrivateVars(builder, moduleTranslation, mlirPrivateVars,
llvmPrivateVars, privateDecls)))
return llvm::make_error<PreviouslyReportedError>();
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::Expected<llvm::BasicBlock *> regionBlock =
@ -4200,31 +4231,37 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
// Skip applying a workshare loop below when translating 'distribute
// parallel do' (it's been already handled by this point while translating
// the nested omp.wsloop).
if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()))
return llvm::Error::success();
if (!isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper())) {
// TODO: Add support for clauses which are valid for DISTRIBUTE
// constructs. Static schedule is the default.
auto schedule = omp::ClauseScheduleKind::Static;
bool isOrdered = false;
std::optional<omp::ScheduleModifier> scheduleMod;
bool isSimd = false;
llvm::omp::WorksharingLoopType workshareLoopType =
llvm::omp::WorksharingLoopType::DistributeStaticLoop;
bool loopNeedsBarrier = false;
llvm::Value *chunk = nullptr;
// TODO: Add support for clauses which are valid for DISTRIBUTE constructs.
// Static schedule is the default.
auto schedule = omp::ClauseScheduleKind::Static;
bool isOrdered = false;
std::optional<omp::ScheduleModifier> scheduleMod;
bool isSimd = false;
llvm::omp::WorksharingLoopType workshareLoopType =
llvm::omp::WorksharingLoopType::DistributeStaticLoop;
bool loopNeedsBarrier = false;
llvm::Value *chunk = nullptr;
llvm::CanonicalLoopInfo *loopInfo =
findCurrentLoopInfo(moduleTranslation);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
ompBuilder->applyWorkshareLoop(
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
workshareLoopType);
llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
ompBuilder->applyWorkshareLoop(
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
workshareLoopType);
if (!wsloopIP)
return wsloopIP.takeError();
}
if (failed(cleanupPrivateVars(builder, moduleTranslation,
distributeOp.getLoc(), llvmPrivateVars,
privateDecls)))
return llvm::make_error<PreviouslyReportedError>();
if (!wsloopIP)
return wsloopIP.takeError();
return llvm::Error::success();
};

View File

@ -0,0 +1,106 @@
// Test code-gen for `omp.distribute` ops with delayed privatizers (i.e. using
// `omp.private` ops).
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
omp.private {type = private} @_QFEi_private_i32 : i32
omp.private {type = private} @_QFEpriv_val_dist_private_f32 : f32
llvm.func @_QQmain() {
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x f32 {bindc_name = "priv_val_dist"} : (i64) -> !llvm.ptr
%3 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
%4 = llvm.mlir.constant(3.140000e+00 : f32) : f32
%5 = llvm.mlir.constant(1000 : i32) : i32
%6 = llvm.mlir.constant(1 : i32) : i32
omp.teams {
omp.distribute private(@_QFEpriv_val_dist_private_f32 %1 -> %arg0, @_QFEi_private_i32 %3 -> %arg1 : !llvm.ptr, !llvm.ptr) {
omp.loop_nest (%arg2) : i32 = (%6) to (%5) inclusive step (%6) {
llvm.store %arg2, %arg1 : i32, !llvm.ptr
llvm.store %4, %arg0 : f32, !llvm.ptr
omp.yield
}
}
omp.terminator
}
llvm.return
}
// CHECK-LABEL: define void @_QQmain() {
// CHECK: call void {{.*}} @__kmpc_fork_teams(ptr @{{.*}}, i32 0, ptr @[[TEAMS_FUNC:.*]])
// CHECK-NEXT: br label %teams.exit
// CHECK: }
// CHECK: define internal void @[[TEAMS_FUNC]]({{.*}}) {
// CHECK: call void @[[DIST_FUNC:.*]]()
// CHECK-NEXT: br label %distribute.exit
// CHECK: }
// CHECK: define internal void @[[DIST_FUNC]]() {
// CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4
// CHECK: %[[IV_ALLOC:.*]] = alloca i32, align 4
// CHECK: omp.loop_nest.region:
// CHECK-NEXT: store i32 %{{.*}}, ptr %[[IV_ALLOC]], align 4
// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4
// CHECK: }
// -----
llvm.func @foo_free(!llvm.ptr)
omp.private {type = firstprivate} @_QFEpriv_val_dist_firstprivate_f32 : f32 copy {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
%0 = llvm.load %arg0 : !llvm.ptr -> f32
llvm.store %0, %arg1 : f32, !llvm.ptr
omp.yield(%arg1 : !llvm.ptr)
} dealloc {
^bb0(%arg0: !llvm.ptr):
llvm.call @foo_free(%arg0) : (!llvm.ptr) -> ()
omp.yield
}
llvm.func @_QQmain() {
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x f32 {bindc_name = "priv_val_dist"} : (i64) -> !llvm.ptr
%4 = llvm.mlir.constant(3.140000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : i32) : i32
omp.distribute private(@_QFEpriv_val_dist_firstprivate_f32 %1 -> %arg0 : !llvm.ptr) {
omp.loop_nest (%arg2) : i32 = (%6) to (%6) inclusive step (%6) {
llvm.store %4, %arg0 : f32, !llvm.ptr
omp.yield
}
}
llvm.return
}
// CHECK-LABEL: define void @_QQmain() {
// CHECK: %[[SHARED_VAR_ALLOC:.*]] = alloca float, i64 1, align 4
// CHECK: %[[SHARED_VAR_PTR:.*]] = getelementptr { ptr }, ptr %[[DIST_PARAM:.*]], i32 0, i32 0
// CHECK: store ptr %[[SHARED_VAR_ALLOC]], ptr %[[SHARED_VAR_PTR]], align 8
// CHECK: call void @[[DIST_FUNC:.*]](ptr %[[DIST_PARAM]])
// CHECK-NEXT: br label %distribute.exit
// CHECK: }
// CHECK: define internal void @[[DIST_FUNC]](ptr %[[DIST_ARG:.*]]) {
// CHECK: %[[SHARED_VAR_GEP:.*]] = getelementptr { ptr }, ptr %[[DIST_ARG]], i32 0, i32 0
// CHECK: %[[SHARED_VAR_PTR2:.*]] = load ptr, ptr %[[SHARED_VAR_GEP]], align 8
// CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4
// CHECK: omp.private.copy:
// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_PTR2]], align 4
// CHECK-NEXT: store float %[[SHARED_VAR_VAL]], ptr %[[PRIV_VAR_ALLOC]], align 4
// CHECK: omp_loop.after:
// CHECK-NEXT: br label %omp.region.cont
// CHECK: omp.region.cont:
// CHECK-NEXT: call void @foo_free(ptr %[[PRIV_VAR_ALLOC]])
// CHECK: omp.loop_nest.region:
// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4
// CHECK: }

View File

@ -105,21 +105,6 @@ llvm.func @distribute_order(%lb : i32, %ub : i32, %step : i32) {
// -----
omp.private {type = private} @x.privatizer : !llvm.ptr
llvm.func @distribute_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// expected-error@below {{not yet implemented: Unhandled clause privatization in omp.distribute operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
omp.distribute private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
omp.yield
}
}
llvm.return
}
// -----
llvm.func @ordered_region_par_level_simd() {
// expected-error@below {{not yet implemented: Unhandled clause parallelization-level in omp.ordered.region operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.ordered.region}}