llvm-project/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.mir
hidekisaito 9d1fd9ec1e
[AMDGPU] Extend DS loop wait optimization with flush point tracking (#175658)
Add support for prefetch patterns where some DS loads are used in the
same iteration (creating flush points) while others remain unflushed at
the backedge.

This complements the existing pure prefetch optimization (PR172728) by
handling cases where partial same-iteration consumption occurs.

Assisted-by: Cursor / claude-4.5-opus-high
2026-03-02 09:19:46 -08:00

90 lines
4.0 KiB
YAML

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
---
# Test: DS prefetch with flush points in a single-block loop.
#
# Layout:
#   bb.0 (preheader): issues three DS loads whose results are first read in bb.1.
#   bb.1 (loop):      issues four DS loads per iteration. The first two are
#                     consumed later in the same iteration (their use is a
#                     "flush point"); the last two are still outstanding at the
#                     backedge and are only read at the top of the next
#                     iteration (prefetches).
#
# Expected: s_wait_dscnt 0 at the end of the preheader, and only non-zero
# s_wait_dscnt values inside the loop (1, 2 and 2 in the assertions below).
name: ds_prefetch_flushed
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  ; CHECK-LABEL: name: ds_prefetch_flushed
  ; CHECK: bb.0:
  ; CHECK-NEXT: successors: %bb.1(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr0, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 0
  ; CHECK-NEXT: S_BRANCH %bb.1
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.1:
  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 1
  ; CHECK-NEXT: $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec
  ; CHECK-NEXT: S_BARRIER
  ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 2
  ; CHECK-NEXT: $vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec
  ; CHECK-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 2
  ; CHECK-NEXT: $vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
  ; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
  ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
  ; CHECK-NEXT: S_BRANCH %bb.2
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: S_ENDPGM 0
  bb.0:
    successors: %bb.1
    liveins: $sgpr0, $vgpr0
    ; Preheader: three DS loads. None of the results is read in this block;
    ; $vgpr10/$vgpr11, $vgpr28 and $vgpr32 are all first read in bb.1.
    $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
    $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
    $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2
    liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32
    ; Use a value that is only ever written by the preheader load
    ; ($vgpr10..13 is not redefined inside the loop).
    $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
    ; Use $vgpr28: written by the preheader load on loop entry, and by the
    ; DS_READ_B128 at offset 96 (previous iteration) when arriving via the
    ; backedge. The assertions expect S_WAIT_DSCNT 1 before this use.
    $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec
    ; Barrier
    S_BARRIER
    ; Loads 1-2 of this iteration: these are "flushed" by the same-iteration
    ; use of $vgpr24/$vgpr25 further down.
    $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
    $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
    ; Use $vgpr32: written by the preheader load on loop entry, and by the
    ; DS_READ_B128 at offset 112 (previous iteration) when arriving via the
    ; backedge. The assertions expect S_WAIT_DSCNT 2 before this use.
    $vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec
    ; Loads 3-4 of this iteration: not read again before the backedge, so they
    ; remain unflushed - true prefetches for the next iteration.
    $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
    $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
    ; Use $vgpr24/$vgpr25 - creates the flush point that flushes loads 1-2.
    ; The assertions expect S_WAIT_DSCNT 2 here.
    $vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
    ; Loop control: decrement the counter in $sgpr0 and branch on SCC.
    $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0
...