llvm-project/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir
Matthias Springer 951ab04d6c
[mlir][NVVM] Add no-rollback option to NVVM lowering passes (#168477)
Add pass options to run lowerings to NVVM without pattern rollback. This
makes the dialect conversions easier to debug and improves
performance/memory usage.
2025-11-18 13:47:28 +08:00

54 lines
1.7 KiB
MLIR

// Tests multiple kernels running concurrently. Launches the same kernel twice;
// each instance increments a global atomic counter and waits for the counter
// to reach 2.
//
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
// RUN: | env CUDA_MODULE_LOADING=EAGER mlir-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
// RUN: --entry-point-result=void
// CUDA_MODULE_LOADING=EAGER avoids an implicit context synchronization on first
// use of each kernel. It is technically not needed for this test, because
// both launches use the same (and only) kernel.
module attributes {gpu.container_module} {
  // Device side: one kernel that bumps the i32 counter it is given and then
  // spins until that counter has reached 2.
  gpu.module @kernels {
    gpu.func @kernel(%counter: memref<i32>) kernel {
      %zero = arith.constant 0 : i32
      %one = arith.constant 1 : i32
      %target = arith.constant 2 : i32
      // Register this instance by atomically adding 1 to the counter. The
      // value returned by the atomic (the counter before the add) is unused.
      %prev = memref.atomic_rmw addi %one, %counter[] : (i32, memref<i32>) -> i32
      // Spin loop: an atomic add of 0 serves as an atomic read of the counter.
      // Keep iterating while the observed value is still below 2; the "do"
      // region carries no values and does no work.
      scf.while : () -> () {
        %seen = memref.atomic_rmw addi %zero, %counter[] : (i32, memref<i32>) -> i32
        %keep_waiting = arith.cmpi slt, %seen, %target : i32
        scf.condition(%keep_waiting)
      } do {
        scf.yield
      }
      gpu.return
    }
  }

  // Host side: zero the counter, launch the kernel twice on two async tokens
  // that do not depend on each other, then wait for both launches.
  func.func @main() {
    %init = arith.constant 0 : i32
    %dim = arith.constant 1 : index
    // Counter allocated with the host_shared attribute and initialised to 0
    // from the host before any launch.
    %counter = gpu.alloc host_shared () : memref<i32>
    memref.store %init, %counter[] : memref<i32>
    // Two separate tokens, so neither launch is ordered after the other. Each
    // kernel instance only exits once both instances have incremented the
    // counter (0 -> 2).
    %tok_a = gpu.wait async
    %tok_b = gpu.wait async
    // Each launch uses a 1x1x1 grid of 1x1x1 blocks.
    %done_a = gpu.launch_func async [%tok_a] @kernels::@kernel
        blocks in (%dim, %dim, %dim)
        threads in (%dim, %dim, %dim)
        args(%counter : memref<i32>)
    %done_b = gpu.launch_func async [%tok_b] @kernels::@kernel
        blocks in (%dim, %dim, %dim)
        threads in (%dim, %dim, %dim)
        args(%counter : memref<i32>)
    // Block the host until both launches have completed.
    gpu.wait [%done_a, %done_b]
    return
  }
}