[AMDGPU] asyncmark support for ASYNC_CNT (#185813)
Some checks failed
Bazel Checks / Buildifier (push) Has been cancelled
Bazel Checks / Bazel Build/Test (push) Has been cancelled
Build CI Tooling Containers / Build Container abi-tests (push) Has been cancelled
Build CI Tooling Containers / Build Container format (push) Has been cancelled
Build CI Tooling Containers / Build Container lint (push) Has been cancelled
Build Windows CI Container / build-ci-container-windows (push) Has been cancelled
Build CI Container / Build Container X64 (push) Has been cancelled
Build CI Container / Build Container ARM64 (push) Has been cancelled
Build CI Container / Build Container agent X64 (push) Has been cancelled
Build CI Container / Build Container agent ARM64 (push) Has been cancelled
Build libc Container / Build libc container (ubuntu-24.04) (push) Has been cancelled
Build libc Container / Build libc container (ubuntu-24.04-arm) (push) Has been cancelled
Build Metrics Container / build-metrics-container (push) Has been cancelled
Check CI Scripts / Check Python Tests (push) Has been cancelled
Test documentation build / Test documentation build (push) Has been cancelled
Libclang Python Binding Tests / Build and run Python unit tests (3.13) (push) Has been cancelled
Libclang Python Binding Tests / Build and run Python unit tests (3.8) (push) Has been cancelled
Build Docker images for libc++ CI / build-and-push (push) Has been cancelled
Test Unprivileged Download Artifact Action / Upload Test Artifact (push) Has been cancelled
Zizmor GitHub Actions Analysis / Run zizmor (push) Has been cancelled
Build CI Tooling Containers / push-ci-container (push) Has been cancelled
Build Windows CI Container / push-ci-container (push) Has been cancelled
Build CI Container / push-ci-container (push) Has been cancelled
Build libc Container / push-libc-container (push) Has been cancelled
Build Metrics Container / push-metrics-container (push) Has been cancelled
Test Unprivileged Download Artifact Action / Test Unprivileged Download Artifact (push) Has been cancelled
Commit Access Review / commit-access-review (push) Has been cancelled

The ASYNC_CNT is used to track the progress of asynchronous copies
between global and LDS memories. By including it in asyncmark, the
compiler can now assist the programmer in generating waits for
ASYNC_CNT.

Assisted-By: Claude Sonnet 4.5

This is part of a stack:

- #185813
- #185810 

Fixes: LCOMPILER-332
This commit is contained in:
Sameer Sahasrabuddhe 2026-04-07 07:23:09 +05:30 committed by GitHub
parent 164505d348
commit f9adee2f6b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 380 additions and 29 deletions

View File

@ -1330,6 +1330,9 @@ defm VMemToLDSLoad : AMDGPUSubtargetFeature<"vmem-to-lds-load-insts",
"w/lds bit set or global_load_lds. This does not include scratch_load_lds."
>;
// Manual predicate for hasAsyncMark(), which is true when the subtarget has
// VMemToLDSLoad or HasAsynccnt.
def HasAsyncMark : Predicate<"Subtarget->hasAsyncMark()">;
defm LdsBarrierArriveAtomic : AMDGPUSubtargetFeature<"lds-barrier-arrive-atomic",
"Has LDS barrier-arrive atomic instructions"
>;

View File

@ -2398,8 +2398,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectTensorLoadStore(I, IntrinsicID);
case Intrinsic::amdgcn_asyncmark:
case Intrinsic::amdgcn_wait_asyncmark:
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
if (!Subtarget->hasVMemToLDSLoad())
if (!Subtarget->hasAsyncMark())
return false;
break;
case Intrinsic::amdgcn_exp_compr:

View File

@ -226,6 +226,8 @@ public:
bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
// Returns true when the asyncmark / wait.asyncmark intrinsics are supported:
// either the subtarget has VMEM-to-LDS loads, or HasAsynccnt is set.
bool hasAsyncMark() const { return hasVMemToLDSLoad() || HasAsynccnt; }
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
}

View File

@ -650,6 +650,14 @@ public:
return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
}
// Decides whether \p MI should update the async-mark score kept for counter
// \p T. Only async LDS-DMA writes qualify; such an instruction matches
// ASYNC_CNT when it uses ASYNC_CNT, and LOAD_CNT otherwise.
bool shouldUpdateAsyncMark(const MachineInstr &MI, InstCounterType T) const {
  if (!isAsyncLdsDmaWrite(MI))
    return false;
  const bool TracksAsyncCnt = SIInstrInfo::usesASYNC_CNT(MI);
  return T == (TracksAsyncCnt ? ASYNC_CNT : LOAD_CNT);
}
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@ -1258,12 +1266,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
}
// FIXME: Not supported on GFX12 yet. Newer async operations use other
// counters too, so will need a map from instruction or event types to
// counter types.
if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
assert(!SIInstrInfo::usesASYNC_CNT(Inst) &&
"unexpected GFX1250 instruction");
if (Context->shouldUpdateAsyncMark(Inst, T)) {
AsyncScore[T] = CurrScore;
}
@ -2110,7 +2113,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
II.eraseFromParent();
Modified = true;
} else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet");
// Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
// shows up in the assembly as a comment with the original parameter N.
unsigned N = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
Wait = Wait.combined(OldWait);
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());

View File

@ -1727,7 +1727,7 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
let hasSideEffects = 0;
}
let SubtargetPredicate = HasVMemToLDSLoad in {
let SubtargetPredicate = HasAsyncMark in {
def ASYNCMARK : SPseudoInstSI<(outs), (ins),
[(int_amdgcn_asyncmark)]> {
let maybeAtomic = 0;

View File

@ -1,19 +0,0 @@
; RUN: split-file %s %t
; RUN: not --crash llc -filetype=null -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 %t/mark.ll 2>&1 | FileCheck --ignore-case %s
; RUN: not llc -filetype=null -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 %t/mark.ll 2>&1 | FileCheck --ignore-case %s
; RUN: not --crash llc -filetype=null -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 %t/wait.ll 2>&1 | FileCheck --ignore-case %s
; RUN: not llc -filetype=null -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 %t/wait.ll 2>&1 | FileCheck --ignore-case %s
; CHECK: LLVM ERROR: Cannot select
;--- mark.ll
define void @async_err() {
call void @llvm.amdgcn.asyncmark()
ret void
}
;--- wait.ll
define void @async_err() {
call void @llvm.amdgcn.wait.asyncmark(i16 0)
ret void
}

View File

@ -0,0 +1,359 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefixes=GFX1250
; Test async mark/wait with global_load_lds and global loads
; This version uses wave barriers to enforce program order so that unrelated vmem
; instructions do not get reordered before reaching this point.
define void @interleaved_with_wave_barrier(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
; GFX1250-LABEL: interleaved_with_wave_barrier:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v4
; GFX1250-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v6, v5
; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], 0x54, v[0:1]
; GFX1250-NEXT: v_add_nc_u32_e32 v3, 0x54, v2
; GFX1250-NEXT: global_load_b32 v10, v[8:9], off offset:44
; GFX1250-NEXT: global_load_b32 v11, v[0:1], off offset:4
; GFX1250-NEXT: ; wave barrier
; GFX1250-NEXT: global_load_async_to_lds_b32 v3, v[4:5], off offset:4 th:TH_LOAD_NT nv
; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], 0x58, v[8:9]
; GFX1250-NEXT: v_add_nc_u32_e32 v3, 0x58, v2
; GFX1250-NEXT: ; wave barrier
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off offset:8
; GFX1250-NEXT: ; wave barrier
; GFX1250-NEXT: global_load_async_to_lds_b32 v3, v[4:5], off offset:4 th:TH_LOAD_LU nv
; GFX1250-NEXT: ; wave barrier
; GFX1250-NEXT: global_load_b32 v1, v[8:9], off offset:48
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: ; wait_asyncmark(1)
; GFX1250-NEXT: s_wait_asynccnt 0x1
; GFX1250-NEXT: ds_load_b32 v3, v2 offset:84
; GFX1250-NEXT: ; wait_asyncmark(0)
; GFX1250-NEXT: s_wait_asynccnt 0x0
; GFX1250-NEXT: ds_load_b32 v2, v2 offset:88
; GFX1250-NEXT: s_wait_loadcnt 0x2
; GFX1250-NEXT: v_add_nc_u32_e32 v4, v11, v10
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x101
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add3_u32 v0, v4, v3, v0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX1250-NEXT: global_store_b32 v[6:7], v0, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
; First batch: global load, global load, async global-to-LDS
%bar_gep11 = getelementptr i32, ptr addrspace(1) %bar, i32 11
%bar_v11 = load i32, ptr addrspace(1) %bar_gep11
%foo_gep1 = getelementptr i32, ptr addrspace(1) %foo, i32 1
%foo_v1 = load i32, ptr addrspace(1) %foo_gep1
%lds_gep21 = getelementptr i32, ptr addrspace(3) %lds, i32 21
; NOTE(review): %bar_gep21 is computed from %foo, not %bar, despite its name.
; Confirm this is intended before renaming or rebasing it (the checks would
; need regenerating if the base pointer changes).
%bar_gep21 = getelementptr i32, ptr addrspace(1) %foo, i32 21
call void @llvm.amdgcn.wave.barrier()
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %bar_gep21, ptr addrspace(3) %lds_gep21, i32 4, i32 u0x21)
call void @llvm.amdgcn.wave.barrier()
call void @llvm.amdgcn.asyncmark()
; Second batch: global load, async global-to-LDS, global load
%foo_gep2 = getelementptr i32, ptr addrspace(1) %foo, i32 2
%foo_v2 = load i32, ptr addrspace(1) %foo_gep2
%bar_gep22 = getelementptr i32, ptr addrspace(1) %bar, i32 22
%lds_gep22 = getelementptr i32, ptr addrspace(3) %lds, i32 22
call void @llvm.amdgcn.wave.barrier()
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %bar_gep22, ptr addrspace(3) %lds_gep22, i32 4, i32 u0x23)
call void @llvm.amdgcn.wave.barrier()
%bar_gep12 = getelementptr i32, ptr addrspace(1) %bar, i32 12
%bar_v12 = load i32, ptr addrspace(1) %bar_gep12
call void @llvm.amdgcn.asyncmark()
; Wait for first async mark and read from LDS
call void @llvm.amdgcn.wait.asyncmark(i16 1)
%lds_val21 = load i32, ptr addrspace(3) %lds_gep21
; Wait for the next async mark.
; Notable that the asyncmark is sufficient to prevent the optimizer from coalescing the previous ds_load with the next one.
call void @llvm.amdgcn.wait.asyncmark(i16 0)
%lds_val22 = load i32, ptr addrspace(3) %lds_gep22
%sum1 = add i32 %foo_v1, %bar_v11
%sum2 = add i32 %sum1, %lds_val21
%sum3 = add i32 %sum2, %foo_v2
; Finally a loadcnt(0) for %bar_v12, which was not included in the async mark that followed it.
%sum4 = add i32 %sum3, %bar_v12
%sum5 = add i32 %sum4, %lds_val22
store i32 %sum5, ptr addrspace(1) %out
ret void
}
; A perfect loop that is unlikely to exist in real life. It uses only async
; operations, and results in waits that exactly match the stream of
; those outstanding operations.
define amdgpu_kernel void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
; GFX1250-LABEL: test_pipelined_loop:
; GFX1250: ; %bb.0: ; %prolog
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x44 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX1250-NEXT: s_add_co_i32 s6, s2, 4
; GFX1250-NEXT: s_mov_b32 s7, s2
; GFX1250-NEXT: v_mov_b32_e32 v2, s6
; GFX1250-NEXT: s_mov_b32 s6, 2
; GFX1250-NEXT: global_load_async_to_lds_b32 v1, v0, s[0:1] offset:4 nv
; GFX1250-NEXT: v_mov_b32_e32 v1, 4
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: global_load_async_to_lds_b32 v2, v1, s[0:1] offset:4 nv
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 8
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: .LBB1_1: ; %loop_body
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: s_add_co_i32 s8, s7, 8
; GFX1250-NEXT: s_add_co_i32 s6, s6, 1
; GFX1250-NEXT: v_mov_b32_e32 v2, s8
; GFX1250-NEXT: global_load_async_to_lds_b32 v2, v0, s[0:1] offset:4 nv
; GFX1250-NEXT: v_mov_b32_e32 v2, s7
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: ; wait_asyncmark(2)
; GFX1250-NEXT: s_wait_asynccnt 0x2
; GFX1250-NEXT: s_add_co_i32 s7, s7, 4
; GFX1250-NEXT: s_cmp_lt_i32 s6, s3
; GFX1250-NEXT: ds_load_b32 v2, v2
; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1250-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1250-NEXT: ; %bb.2: ; %epilog
; GFX1250-NEXT: s_lshl2_add_u32 s0, s3, s2
; GFX1250-NEXT: ; wait_asyncmark(1)
; GFX1250-NEXT: s_wait_asynccnt 0x1
; GFX1250-NEXT: s_add_co_i32 s0, s0, -8
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 nv
; GFX1250-NEXT: ds_load_b32 v0, v0
; GFX1250-NEXT: ; wait_asyncmark(0)
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_wait_asynccnt 0x0
; GFX1250-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
prolog:
; Load first iteration
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 u0x20)
call void @llvm.amdgcn.asyncmark()
; Load second iteration
%lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1
%foo_gep1 = getelementptr i32, ptr addrspace(1) %foo, i32 1
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %foo_gep1, ptr addrspace(3) %lds_gep1, i32 4, i32 u0x20)
call void @llvm.amdgcn.asyncmark()
br label %loop_body
loop_body:
%i = phi i32 [ 2, %prolog ], [ %i.next, %loop_body ]
%sum = phi i32 [ 0, %prolog ], [ %sum_i, %loop_body ]
; Load next iteration
%lds_gep_cur = getelementptr i32, ptr addrspace(3) %lds, i32 %i
%foo_gep_cur = getelementptr i32, ptr addrspace(1) %foo, i32 %i
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %foo_gep_cur, ptr addrspace(3) %lds_gep_cur, i32 4, i32 u0x20)
call void @llvm.amdgcn.asyncmark()
; Wait for iteration i-2 and process
call void @llvm.amdgcn.wait.asyncmark(i16 2)
%lds_idx = sub i32 %i, 2
%lds_gep_read = getelementptr i32, ptr addrspace(3) %lds, i32 %lds_idx
%lds_val = load i32, ptr addrspace(3) %lds_gep_read
%sum_i = add i32 %sum, %lds_val
%i.next = add i32 %i, 1
%cmp = icmp slt i32 %i.next, %n
br i1 %cmp, label %loop_body, label %epilog
epilog:
; Process remaining iterations
call void @llvm.amdgcn.wait.asyncmark(i16 1)
%lds_n_2 = sub i32 %n, 2
%lds_gep_n_2 = getelementptr i32, ptr addrspace(3) %lds, i32 %lds_n_2
%lds_val_n_2 = load i32, ptr addrspace(3) %lds_gep_n_2
%sum_e2 = add i32 %sum_i, %lds_val_n_2
%out_gep_e1 = getelementptr i32, ptr addrspace(1) %out, i32 %lds_n_2
call void @llvm.amdgcn.wait.asyncmark(i16 0)
%lds_n_1 = sub i32 %n, 1
%lds_gep_n_1 = getelementptr i32, ptr addrspace(3) %lds, i32 %lds_n_1
%lds_val_n_1 = load i32, ptr addrspace(3) %lds_gep_n_1
; NOTE(review): %sum_e1 (the only user of %lds_val_n_1) and %out_gep_e1 are
; never used; the store below writes %sum_e2 to %bar. The autogenerated checks
; accordingly contain a single ds_load in the epilog, so no LDS read follows
; wait_asyncmark(0). Confirm whether %sum_e1 was meant to be stored instead.
%sum_e1 = add i32 %sum_e2, %lds_val_n_1
store i32 %sum_e2, ptr addrspace(1) %bar
ret void
}
; Software pipelined loop with async global-to-LDS and global loads
define amdgpu_kernel void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
; GFX1250-LABEL: test_pipelined_loop_with_global:
; GFX1250: ; %bb.0: ; %prolog
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x24 nv
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s6, s[8:9], 0x0
; GFX1250-NEXT: s_load_b32 s7, s[0:1], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s10
; GFX1250-NEXT: s_add_co_i32 s11, s10, 4
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_mov_b32 v4, s11
; GFX1250-NEXT: s_load_b32 s11, s[4:5], 0x44 nv
; GFX1250-NEXT: global_load_async_to_lds_b32 v1, v0, s[8:9] offset:4 nv
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[8:9] offset:4
; GFX1250-NEXT: global_load_b32 v2, v0, s[0:1] offset:4
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 8
; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[8:9], 8
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s7
; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1250-NEXT: global_load_async_to_lds_b32 v4, v3, s[8:9] offset:4 nv
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
; GFX1250-NEXT: s_mov_b32 s8, 2
; GFX1250-NEXT: s_mov_b32 s9, s10
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: .LBB2_1: ; %loop_body
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: s_add_co_i32 s12, s9, 8
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v9, s12
; GFX1250-NEXT: v_mov_b32_e32 v8, v3
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v3, v0, s[4:5]
; GFX1250-NEXT: global_load_b32 v4, v0, s[0:1]
; GFX1250-NEXT: v_dual_add_nc_u32 v10, v5, v6 :: v_dual_mov_b32 v6, v2
; GFX1250-NEXT: global_load_async_to_lds_b32 v9, v0, s[4:5] offset:4 nv
; GFX1250-NEXT: v_mov_b32_e32 v9, s9
; GFX1250-NEXT: ; asyncmark
; GFX1250-NEXT: ; wait_asyncmark(2)
; GFX1250-NEXT: s_wait_asynccnt 0x2
; GFX1250-NEXT: s_wait_asynccnt 0x2
; GFX1250-NEXT: s_add_co_i32 s8, s8, 1
; GFX1250-NEXT: s_add_co_i32 s9, s9, 4
; GFX1250-NEXT: ds_load_b32 v9, v9
; GFX1250-NEXT: v_mov_b32_e32 v5, v1
; GFX1250-NEXT: s_cmp_lt_i32 s8, s11
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[4:5], 4
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: v_add_nc_u32_e32 v9, v10, v9
; GFX1250-NEXT: global_store_b32 v0, v9, s[6:7]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], 4
; GFX1250-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1250-NEXT: ; %bb.2: ; %epilog
; GFX1250-NEXT: s_add_co_i32 s0, s11, -2
; GFX1250-NEXT: ; wait_asyncmark(1)
; GFX1250-NEXT: s_wait_asynccnt 0x1
; GFX1250-NEXT: s_lshl2_add_u32 s1, s0, s10
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_dual_add_nc_u32 v2, v8, v7 :: v_dual_mov_b32 v0, s1
; GFX1250-NEXT: ds_load_b32 v1, v0
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_add_nc_u32 v1, v2, v1
; GFX1250-NEXT: global_store_b32 v5, v1, s[2:3] scale_offset
; GFX1250-NEXT: ; wait_asyncmark(0)
; GFX1250-NEXT: s_wait_asynccnt 0x0
; GFX1250-NEXT: ds_load_b32 v0, v0 offset:4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_add_nc_u32_e32 v1, v3, v4
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX1250-NEXT: global_store_b32 v5, v0, s[2:3] offset:4 scale_offset
; GFX1250-NEXT: s_endpgm
prolog:
; Load first iteration
%v0 = load i32, ptr addrspace(1) %foo
%g0 = load i32, ptr addrspace(1) %bar
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 u0x20)
call void @llvm.amdgcn.asyncmark()
; Load second iteration
%foo_gep1 = getelementptr i32, ptr addrspace(1) %foo, i32 1
%v1 = load i32, ptr addrspace(1) %foo_gep1
%bar_gep1 = getelementptr i32, ptr addrspace(1) %bar, i32 1
%g1 = load i32, ptr addrspace(1) %bar_gep1
%lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %foo_gep1, ptr addrspace(3) %lds_gep1, i32 4, i32 u0x20)
call void @llvm.amdgcn.asyncmark()
br label %loop_body
loop_body:
%i = phi i32 [ 2, %prolog ], [ %i.next, %loop_body ]
%prev_v = phi i32 [ %v0, %prolog ], [ %v1, %loop_body ]
%prev_g = phi i32 [ %g0, %prolog ], [ %g1, %loop_body ]
%v1_phi = phi i32 [ %v1, %prolog ], [ %cur_v, %loop_body ]
%g1_phi = phi i32 [ %g1, %prolog ], [ %cur_g, %loop_body ]
; Load next iteration
%foo_gep_cur = getelementptr i32, ptr addrspace(1) %foo, i32 %i
%cur_v = load i32, ptr addrspace(1) %foo_gep_cur
%bar_gep_cur = getelementptr i32, ptr addrspace(1) %bar, i32 %i
%cur_g = load i32, ptr addrspace(1) %bar_gep_cur
%lds_gep_cur = getelementptr i32, ptr addrspace(3) %lds, i32 %i
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %foo_gep_cur, ptr addrspace(3) %lds_gep_cur, i32 4, i32 u0x20)
call void @llvm.amdgcn.asyncmark()
; Wait for iteration i-2 and process
; NOTE(review): the autogenerated checks for this single wait contain two
; identical back-to-back "s_wait_asynccnt 0x2" instructions; the second looks
; redundant and may indicate a missed merge in the waitcnt insertion pass.
call void @llvm.amdgcn.wait.asyncmark(i16 2)
%lds_idx = sub i32 %i, 2
%lds_gep_read = getelementptr i32, ptr addrspace(3) %lds, i32 %lds_idx
%lds_val = load i32, ptr addrspace(3) %lds_gep_read
%sum1 = add i32 %prev_v, %prev_g
%sum2 = add i32 %sum1, %lds_val
%out_gep = getelementptr i32, ptr addrspace(1) %out, i32 %lds_idx
store i32 %sum2, ptr addrspace(1) %out_gep
%i.next = add i32 %i, 1
%cmp = icmp slt i32 %i.next, %n
br i1 %cmp, label %loop_body, label %epilog
epilog:
; Process remaining iterations
call void @llvm.amdgcn.wait.asyncmark(i16 1)
%lds_n_2 = sub i32 %n, 2
%lds_gep_n_2 = getelementptr i32, ptr addrspace(3) %lds, i32 %lds_n_2
%lds_val_n_2 = load i32, ptr addrspace(3) %lds_gep_n_2
%sum_e1 = add i32 %v1_phi, %g1_phi
%sum_e2 = add i32 %sum_e1, %lds_val_n_2
%out_gep_e1 = getelementptr i32, ptr addrspace(1) %out, i32 %lds_n_2
store i32 %sum_e2, ptr addrspace(1) %out_gep_e1
call void @llvm.amdgcn.wait.asyncmark(i16 0)
%lds_n_1 = sub i32 %n, 1
%lds_gep_n_1 = getelementptr i32, ptr addrspace(3) %lds, i32 %lds_n_1
%lds_val_n_1 = load i32, ptr addrspace(3) %lds_gep_n_1
%sum_e3 = add i32 %cur_v, %cur_g
%sum_e4 = add i32 %sum_e3, %lds_val_n_1
%out_gep_e2 = getelementptr i32, ptr addrspace(1) %out, i32 %lds_n_1
store i32 %sum_e4, ptr addrspace(1) %out_gep_e2
ret void
}