[flang][cuda] Add interfaces and lowering for tma_bulk_load (#165474)
As defined in https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/#load-and-store-functions-using-bulk-tma-operations
This commit is contained in:
parent
e0dab824f3
commit
5d89a474b0
@ -461,6 +461,13 @@ struct IntrinsicLibrary {
|
||||
mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>);
|
||||
void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>);
|
||||
|
||||
@ -1045,6 +1045,55 @@ static constexpr IntrinsicHandler handlers[]{
|
||||
{"dst", asAddr},
|
||||
{"nbytes", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_ldc4",
|
||||
&I::genTMABulkLoadC4,
|
||||
{{{"barrier", asAddr},
|
||||
{"src", asAddr},
|
||||
{"dst", asAddr},
|
||||
{"nelems", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_ldc8",
|
||||
&I::genTMABulkLoadC8,
|
||||
{{{"barrier", asAddr},
|
||||
{"src", asAddr},
|
||||
{"dst", asAddr},
|
||||
{"nelems", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_ldi4",
|
||||
&I::genTMABulkLoadI4,
|
||||
{{{"barrier", asAddr},
|
||||
{"src", asAddr},
|
||||
{"dst", asAddr},
|
||||
{"nelems", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_ldi8",
|
||||
&I::genTMABulkLoadI8,
|
||||
{{{"barrier", asAddr},
|
||||
{"src", asAddr},
|
||||
{"dst", asAddr},
|
||||
{"nelems", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_ldr2",
|
||||
&I::genTMABulkLoadR2,
|
||||
{{{"barrier", asAddr},
|
||||
{"src", asAddr},
|
||||
{"dst", asAddr},
|
||||
{"nelems", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_ldr4",
|
||||
&I::genTMABulkLoadR4,
|
||||
{{{"barrier", asAddr},
|
||||
{"src", asAddr},
|
||||
{"dst", asAddr},
|
||||
{"nelems", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_ldr8",
|
||||
&I::genTMABulkLoadR8,
|
||||
{{{"barrier", asAddr},
|
||||
{"src", asAddr},
|
||||
{"dst", asAddr},
|
||||
{"nelems", asValue}}},
|
||||
/*isElemental=*/false},
|
||||
{"tma_bulk_s2g",
|
||||
&I::genTMABulkS2G,
|
||||
{{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
|
||||
@ -9278,6 +9327,93 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
|
||||
}
|
||||
|
||||
static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
|
||||
mlir::Value barrier, mlir::Value src,
|
||||
mlir::Value dst, mlir::Value nelem,
|
||||
mlir::Value eleSize) {
|
||||
mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
|
||||
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
|
||||
barrier = builder.createConvert(loc, llvmPtrTy, barrier);
|
||||
mlir::NVVM::InlinePtxOp::create(
|
||||
builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {},
|
||||
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], "
|
||||
"[%1], %2, [%3];",
|
||||
{});
|
||||
mlir::NVVM::InlinePtxOp::create(
|
||||
builder, loc, mlir::TypeRange{}, {barrier, size}, {},
|
||||
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {});
|
||||
}
|
||||
|
||||
// TMA_BULK_LOADC4
|
||||
void IntrinsicLibrary::genTMABulkLoadC4(
|
||||
llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 4);
|
||||
mlir::Value eleSize =
|
||||
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
|
||||
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
|
||||
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
|
||||
}
|
||||
|
||||
// TMA_BULK_LOADC8
|
||||
void IntrinsicLibrary::genTMABulkLoadC8(
|
||||
llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 4);
|
||||
mlir::Value eleSize =
|
||||
builder.createIntegerConstant(loc, builder.getI32Type(), 16);
|
||||
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
|
||||
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
|
||||
}
|
||||
|
||||
// TMA_BULK_LOADI4
|
||||
void IntrinsicLibrary::genTMABulkLoadI4(
|
||||
llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 4);
|
||||
mlir::Value eleSize =
|
||||
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
|
||||
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
|
||||
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
|
||||
}
|
||||
|
||||
// TMA_BULK_LOADI8
|
||||
void IntrinsicLibrary::genTMABulkLoadI8(
|
||||
llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 4);
|
||||
mlir::Value eleSize =
|
||||
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
|
||||
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
|
||||
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
|
||||
}
|
||||
|
||||
// TMA_BULK_LOADR2
|
||||
void IntrinsicLibrary::genTMABulkLoadR2(
|
||||
llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 4);
|
||||
mlir::Value eleSize =
|
||||
builder.createIntegerConstant(loc, builder.getI32Type(), 2);
|
||||
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
|
||||
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
|
||||
}
|
||||
|
||||
// TMA_BULK_LOADR4
|
||||
void IntrinsicLibrary::genTMABulkLoadR4(
|
||||
llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 4);
|
||||
mlir::Value eleSize =
|
||||
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
|
||||
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
|
||||
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
|
||||
}
|
||||
|
||||
// TMA_BULK_LOADR8
|
||||
void IntrinsicLibrary::genTMABulkLoadR8(
|
||||
llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 4);
|
||||
mlir::Value eleSize =
|
||||
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
|
||||
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
|
||||
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
|
||||
}
|
||||
|
||||
// TMA_BULK_S2G (CUDA)
|
||||
void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
assert(args.size() == 3);
|
||||
|
||||
@ -2067,6 +2067,67 @@ implicit none
|
||||
end subroutine
|
||||
end interface
|
||||
|
||||
! Load specific types, count is in elements
|
||||
! -----------------------------------------
|
||||
interface tma_bulk_load
|
||||
attributes(device) subroutine tma_bulk_ldc4(barrier, src, dst, nelems)
|
||||
!dir$ ignore_tkr (r) src, (r) dst
|
||||
integer(8), shared :: barrier
|
||||
complex(4), device :: src(*)
|
||||
complex(4), shared :: dst(*)
|
||||
integer(4), value :: nelems
|
||||
end subroutine
|
||||
|
||||
attributes(device) subroutine tma_bulk_ldc8(barrier, src, dst, nelems)
|
||||
!dir$ ignore_tkr (r) src, (r) dst
|
||||
integer(8), shared :: barrier
|
||||
complex(8), device :: src(*)
|
||||
complex(8), shared :: dst(*)
|
||||
integer(4), value :: nelems
|
||||
end subroutine
|
||||
|
||||
attributes(device) subroutine tma_bulk_ldi4(barrier, src, dst, nelems)
|
||||
!dir$ ignore_tkr (r) src, (r) dst
|
||||
integer(8), shared :: barrier
|
||||
integer(4), device :: src(*)
|
||||
integer(4), shared :: dst(*)
|
||||
integer(4), value :: nelems
|
||||
end subroutine
|
||||
|
||||
attributes(device) subroutine tma_bulk_ldi8(barrier, src, dst, nelems)
|
||||
!dir$ ignore_tkr (r) src, (r) dst
|
||||
integer(8), shared :: barrier
|
||||
integer(8), device :: src(*)
|
||||
integer(8), shared :: dst(*)
|
||||
integer(4), value :: nelems
|
||||
end subroutine
|
||||
|
||||
attributes(device) subroutine tma_bulk_ldr2(barrier, src, dst, nelems)
|
||||
!dir$ ignore_tkr (r) src, (r) dst
|
||||
integer(8), shared :: barrier
|
||||
real(2), device :: src(*)
|
||||
real(2), shared :: dst(*)
|
||||
integer(4), value :: nelems
|
||||
end subroutine
|
||||
|
||||
attributes(device) subroutine tma_bulk_ldr4(barrier, src, dst, nelems)
|
||||
!dir$ ignore_tkr (r) src, (r) dst
|
||||
integer(8), shared :: barrier
|
||||
real(4), device :: src(*)
|
||||
real(4), shared :: dst(*)
|
||||
integer(4), value :: nelems
|
||||
end subroutine
|
||||
|
||||
attributes(device) subroutine tma_bulk_ldr8(barrier, src, dst, nelems)
|
||||
!dir$ ignore_tkr (r) src, (r) dst
|
||||
integer(8), shared :: barrier
|
||||
real(8), device :: src(*)
|
||||
real(8), shared :: dst(*)
|
||||
integer(4), value :: nelems
|
||||
end subroutine
|
||||
end interface
|
||||
|
||||
|
||||
contains
|
||||
|
||||
attributes(device) subroutine syncthreads()
|
||||
|
||||
@ -516,3 +516,136 @@ end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep()
|
||||
! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32
|
||||
|
||||
attributes(global) subroutine test_tma_bulk_load_c4(a, n)
|
||||
integer(8), shared :: barrier1
|
||||
integer, value :: n
|
||||
complex(4), device :: r8(n)
|
||||
complex(4), shared :: tmp(1024)
|
||||
integer(4) :: j, elem_count
|
||||
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4
|
||||
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
|
||||
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
|
||||
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
|
||||
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
|
||||
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
|
||||
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
|
||||
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xcomplex<f32>>>, !fir.ref<complex<f32>>, i32, !llvm.ptr)
|
||||
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
|
||||
|
||||
attributes(global) subroutine test_tma_bulk_load_c8(a, n)
|
||||
integer(8), shared :: barrier1
|
||||
integer, value :: n
|
||||
complex(8), device :: r8(n)
|
||||
complex(8), shared :: tmp(1024)
|
||||
integer(4) :: j, elem_count
|
||||
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8
|
||||
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
|
||||
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
|
||||
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
|
||||
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32
|
||||
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
|
||||
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
|
||||
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xcomplex<f64>>>, !fir.ref<complex<f64>>, i32, !llvm.ptr)
|
||||
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
|
||||
|
||||
attributes(global) subroutine test_tma_bulk_load_i4(a, n)
|
||||
integer(8), shared :: barrier1
|
||||
integer, value :: n
|
||||
integer(4), device :: r8(n)
|
||||
integer(4), shared :: tmp(1024)
|
||||
integer(4) :: j, elem_count
|
||||
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4
|
||||
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
|
||||
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
|
||||
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
|
||||
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
|
||||
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
|
||||
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
|
||||
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<i32>, i32, !llvm.ptr)
|
||||
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
|
||||
|
||||
attributes(global) subroutine test_tma_bulk_load_i8(a, n)
|
||||
integer(8), shared :: barrier1
|
||||
integer, value :: n
|
||||
integer(8), device :: r8(n)
|
||||
integer(8), shared :: tmp(1024)
|
||||
integer(4) :: j, elem_count
|
||||
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8
|
||||
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
|
||||
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
|
||||
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
|
||||
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
|
||||
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
|
||||
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
|
||||
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xi64>>, !fir.ref<i64>, i32, !llvm.ptr)
|
||||
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
|
||||
|
||||
attributes(global) subroutine test_tma_bulk_load_r2(a, n)
|
||||
integer(8), shared :: barrier1
|
||||
integer, value :: n
|
||||
real(2), device :: r8(n)
|
||||
real(2), shared :: tmp(1024)
|
||||
integer(4) :: j, elem_count
|
||||
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2
|
||||
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
|
||||
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
|
||||
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
|
||||
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32
|
||||
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
|
||||
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
|
||||
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf16>>, !fir.ref<f16>, i32, !llvm.ptr)
|
||||
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
|
||||
|
||||
attributes(global) subroutine test_tma_bulk_load_r4(a, n)
|
||||
integer(8), shared :: barrier1
|
||||
integer, value :: n
|
||||
real(4), device :: r8(n)
|
||||
real(4), shared :: tmp(1024)
|
||||
integer(4) :: j, elem_count
|
||||
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4
|
||||
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
|
||||
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
|
||||
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
|
||||
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
|
||||
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
|
||||
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
|
||||
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf32>>, !fir.ref<f32>, i32, !llvm.ptr)
|
||||
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
|
||||
|
||||
attributes(global) subroutine test_tma_bulk_load_r8(a, n)
|
||||
integer(8), shared :: barrier1
|
||||
integer, value :: n
|
||||
real(8), device :: r8(n)
|
||||
real(8), shared :: tmp(1024)
|
||||
integer(4) :: j, elem_count
|
||||
call tma_bulk_load(barrier1, r8(j), tmp, elem_count)
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8
|
||||
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
|
||||
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
|
||||
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
|
||||
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
|
||||
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
|
||||
! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr
|
||||
! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf64>>, !fir.ref<f64>, i32, !llvm.ptr)
|
||||
! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user