llvm-project/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
Alexander Richardson 07e2ba445d
[AMDGPU] Set AS8 address width to 48 bits
Of the 128-bits of buffer descriptor only 48 bits are address bits, so
following the discussion on https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/54,
the logical conclusion is to set the index width to 48 bits instead of
the current value of 128.

Most of the test changes are mechanical datalayout updates, but there
is one actual change: the ptrmask test now uses .i48 instead of .i128
and I had to update SelectionDAGBuilder to correctly extend the mask.

Reviewed By: krzysz00

Pull Request: https://github.com/llvm/llvm-project/pull/139419
2025-05-19 17:26:05 -07:00

80 lines
4.3 KiB
MLIR

// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
// Only check the overall shape of the code and the presence of relevant
// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
// Integer-add reduction over i32: the init region yields the constant 0 and
// the combiner region yields the sum of its two block arguments.
omp.declare_reduction @add_reduction_i32 : i32 init {
^init(%unused: i32):
  // The init block argument is not read; the initial value is always 0.
  %identity = llvm.mlir.constant(0 : i32) : i32
  omp.yield(%identity : i32)
} combiner {
^combine(%lhs: i32, %rhs: i32):
  %combined = llvm.add %lhs, %rhs : i32
  omp.yield(%combined : i32)
}
// Test input: a function with an omp.target region that contains a single
// omp.teams construct reducing into "sum" via @add_reduction_i32. The loop is
// written as explicit LLVM-dialect control flow inside the teams region.
llvm.func @simple_target_teams_only_reduction_() attributes {fir.internal_name = "_QPsimple_target_teams_only_reduction", frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "gfx1030", target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx10-3-insts", "+gfx10-insts", "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize32"]>} {
// Stack slot for "sum": allocated in address space 5, then cast to a
// default-address-space pointer.
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x i32 {bindc_name = "sum"} : (i64) -> !llvm.ptr<5>
%2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
// Stack slot for "index_", set up the same way.
%3 = llvm.mlir.constant(1 : i64) : i64
%4 = llvm.alloca %3 x i32 {bindc_name = "index_"} : (i64) -> !llvm.ptr<5>
%5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
%6 = llvm.mlir.constant(0 : i32) : i32
// NOTE(review): %7 and %8 are not referenced by any operation in this function.
%7 = llvm.mlir.constant(1 : i64) : i64
%8 = llvm.mlir.constant(1 : i64) : i64
// sum = 0
llvm.store %6, %2 : i32, !llvm.ptr
// Map "sum" tofrom (captured by reference) and "index_" implicitly (captured
// by copy) into the target region.
%9 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "sum"}
%10 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "index_"}
// Inside the target region, %arg0 is the mapped "sum" and %arg1 the mapped "index_".
omp.target map_entries(%9 -> %arg0, %10 -> %arg1 : !llvm.ptr, !llvm.ptr) {
// Loop constants: exit bound 0, initial counter 10000, step 1.
%11 = llvm.mlir.constant(0 : index) : i64
%12 = llvm.mlir.constant(10000 : index) : i64
%13 = llvm.mlir.constant(1 : index) : i64
// Reduce into %arg0 with @add_reduction_i32; %arg2 is the reduction variable
// as seen inside the teams region.
omp.teams reduction(@add_reduction_i32 %arg0 -> %arg2 : !llvm.ptr) {
// Enter the loop header with induction value 1 (i32) and a remaining-iteration
// counter of 10000 (i64).
%14 = llvm.trunc %13 : i64 to i32
llvm.br ^bb1(%14, %12 : i32, i64)
// Loop header: %15 is the current induction value, %16 the remaining count.
^bb1(%15: i32, %16: i64): // 2 preds: ^bb0, ^bb2
// Keep iterating while the remaining count is > 0 (signed compare).
%17 = llvm.icmp "sgt" %16, %11 : i64
llvm.cond_br %17, ^bb2, ^bb3
// Loop body.
^bb2: // pred: ^bb1
// Publish the induction value to "index_", then accumulate it into the
// reduction variable: *%arg2 = *%arg2 + *%arg1.
llvm.store %15, %arg1 : i32, !llvm.ptr
%18 = llvm.load %arg2 : !llvm.ptr -> i32
%19 = llvm.load %arg1 : !llvm.ptr -> i32
%20 = llvm.add %18, %19 : i32
llvm.store %20, %arg2 : i32, !llvm.ptr
// Advance: induction value += 1 (nsw), remaining count -= 1.
%21 = llvm.load %arg1 : !llvm.ptr -> i32
%22 = llvm.add %21, %14 overflow<nsw> : i32
%23 = llvm.sub %16, %13 : i64
llvm.br ^bb1(%22, %23 : i32, i64)
// Loop exit: write the final induction value back to "index_".
^bb3: // pred: ^bb1
llvm.store %15, %arg1 : i32, !llvm.ptr
omp.terminator
}
omp.terminator
}
llvm.return
}
}
// CHECK: call i32 @__kmpc_target_init
// CHECK: call void @[[OUTLINED:__omp_offloading_[A-Za-z0-9_.]*]]
// CHECK: %[[MASTER:.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2
// CHECK: icmp eq i32 %[[MASTER]], 1
// CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
// CHECK: [[THEN]]:
// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
// CHECK-NEXT: store i32 %[[FINAL_RESULT]]
// CHECK: call void @__kmpc_barrier
// CHECK: call void @__kmpc_target_deinit
// CHECK: define internal void @[[OUTLINED]]
// Skip to the loop
// CHECK: br i1
// CHECK: %[[TEAM_RHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[TEAM_LHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[TEAM_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[TEAM_RHS]], %[[TEAM_LHS]]
// CHECK-NEXT: store i32 %[[TEAM_RESULT]]