From 2ada4b8fb0914ebdddc386130db24bcc074b3554 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 11 Feb 2026 18:23:34 -0800 Subject: [PATCH] [mlir][ROCDL] Wrap asyncmark and wait.asyncmark intrinsics (#181054) (see op-level and LLVM documentation for details so I'm not repeating myself, but these are the general operations for compiler-operated asynchronous operation tracking, which frees programmers from having to deal with all the different counters, allows certain optimizations, and doesn't require precise alias analysis) ----- Co-authored-by: Claude Opus 4.5 --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 55 ++++++++++++++++++++ mlir/test/Dialect/LLVMIR/rocdl.mlir | 14 +++++ mlir/test/Target/LLVMIR/rocdl.mlir | 14 +++++ 3 files changed, 83 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 6a874aafdec3..c3af1bd32ebd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -590,6 +590,61 @@ def ROCDL_WaitTensorcntOp: ROCDL_ConcreteNonMemIntrOp<"s.wait.tensorcnt", [], 0, let assemblyFormat = "$count attr-dict"; } +def ROCDL_AsyncmarkOp : ROCDL_ConcreteNonMemIntrOp<"asyncmark", [], 0>, + Arguments<(ins)> { + let summary = "Mark the end of a group of asynchronous operations"; + let description = [{ + This operation, in conjunction with `rocdl.wait.asyncmark`, forms the + compiler-provided framework for tracking explicitly asynchronous + memory operations, such as copies to LDS that use async intrinsics + and gfx1250's tensor loads. + + Details of its behavior can be found in + [the LLVM documentation on async tracking](/llvm/docs/AMDGPUAsyncOperations.rst). + + See `rocdl.wait.asyncmark`'s documentation for a usage example. + + Available on gfx9 and later. 
+ }]; + let results = (outs); + let assemblyFormat = "attr-dict"; +} + +def ROCDL_WaitAsyncmarkOp: ROCDL_ConcreteNonMemIntrOp<"wait.asyncmark", [], 0, [0], ["count"]>, + Arguments<(ins I16Attr:$count)> { + let summary = "Wait until N or fewer async operation groups are unexecuted"; + let description = [{ + This operation, along with `rocdl.asyncmark`, forms the compiler-provided + framework for explicitly tracking asynchronous operations. + + At the point where a wait.asyncmark operation is executed, all async operations + that were part of any async group (established by asyncmark in program order) + other than the `count` most recently added ones will have finished executing. + + For more detail, including on how this mechanism composes with function calls, + see [the LLVM documentation on async tracking](/llvm/docs/AMDGPUAsyncOperations.rst). + + Available on gfx9 and later. + + Example: + ```mlir + rocdl.tensor.load.to.lds ... + rocdl.global.async.load.to.lds ... + + rocdl.asyncmark + + rocdl.tensor.load.to.lds ... + rocdl.global.async.load.to.lds ... 
+ + rocdl.asyncmark + + rocdl.wait.asyncmark 1 // First group of loads completes after this + ``` + }]; + let results = (outs); + let assemblyFormat = "$count attr-dict"; +} + def ROCDL_SetPrioOp : ROCDL_ConcreteNonMemIntrOp<"s.setprio", [], 0, [0], ["priority"]>, Arguments<(ins I16Attr:$priority)> { let assemblyFormat = "$priority attr-dict"; diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index ca599e802534..2adb5bc90915 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -1256,6 +1256,20 @@ llvm.func @rocdl.s.wait.tensorcnt() { llvm.return } +llvm.func @rocdl.asyncmark() { + // CHECK-LABEL: rocdl.asyncmark + // CHECK: rocdl.asyncmark + rocdl.asyncmark + llvm.return +} + +llvm.func @rocdl.wait.asyncmark() { + // CHECK-LABEL: rocdl.wait.asyncmark + // CHECK: rocdl.wait.asyncmark 0 + rocdl.wait.asyncmark 0 + llvm.return +} + // ----- llvm.func @rocdl.readfirstlane(%src : f32) -> f32 { diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 382bc0b9f8ff..7a7e76410e4d 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -391,6 +391,20 @@ llvm.func @rocdl.s.wait.tensorcnt() { llvm.return } +llvm.func @rocdl.asyncmark() { + // CHECK-LABEL: rocdl.asyncmark + // CHECK-NEXT: call void @llvm.amdgcn.asyncmark() + rocdl.asyncmark + llvm.return +} + +llvm.func @rocdl.wait.asyncmark() { + // CHECK-LABEL: rocdl.wait.asyncmark + // CHECK-NEXT: call void @llvm.amdgcn.wait.asyncmark(i16 0) + rocdl.wait.asyncmark 0 + llvm.return +} + llvm.func @rocdl.setprio() { // CHECK: call void @llvm.amdgcn.s.setprio(i16 0) rocdl.s.setprio 0