Add support for prefetch patterns where some DS loads are used in the same iteration (creating flush points) while others remain unflushed at the backedge. This complements the existing pure prefetch optimization (PR172728) by handling cases where partial same-iteration consumption occurs. Assisted-by: Cursor / claude-4.5-opus-high
90 lines
4.0 KiB
YAML
90 lines
4.0 KiB
YAML
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
|
|
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s
|
|
|
|
---
|
|
# Test: DS prefetch with flush points in single-block loop.
|
|
# Preheader has DS loads. Loop has DS loads where some are used in the same iteration
|
|
# (creating flush points) but others remain unflushed at backedge (prefetches).
|
|
# Expected: S_WAIT_DSCNT 0 at the end of the preheader; non-zero S_WAIT_DSCNT values (1, 2, 2) inside the loop.
|
|
name: ds_prefetch_flushed
|
|
tracksRegLiveness: true
|
|
machineFunctionInfo:
|
|
isEntryFunction: true
|
|
body: |
|
|
; CHECK-LABEL: name: ds_prefetch_flushed
|
|
; CHECK: bb.0:
|
|
; CHECK-NEXT: successors: %bb.1(0x80000000)
|
|
; CHECK-NEXT: liveins: $sgpr0, $vgpr0
|
|
; CHECK-NEXT: {{ $}}
|
|
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
|
|
; CHECK-NEXT: $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
|
|
; CHECK-NEXT: $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
|
|
; CHECK-NEXT: S_WAIT_DSCNT 0
|
|
; CHECK-NEXT: S_BRANCH %bb.1
|
|
; CHECK-NEXT: {{ $}}
|
|
; CHECK-NEXT: bb.1:
|
|
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
|
; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32
|
|
; CHECK-NEXT: {{ $}}
|
|
; CHECK-NEXT: $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
|
|
; CHECK-NEXT: S_WAIT_DSCNT 1
|
|
; CHECK-NEXT: $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec
|
|
; CHECK-NEXT: S_BARRIER
|
|
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
|
|
; CHECK-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
|
|
; CHECK-NEXT: S_WAIT_DSCNT 2
|
|
; CHECK-NEXT: $vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec
|
|
; CHECK-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
|
|
; CHECK-NEXT: $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
|
|
; CHECK-NEXT: S_WAIT_DSCNT 2
|
|
; CHECK-NEXT: $vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
|
; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
|
|
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
|
|
; CHECK-NEXT: S_BRANCH %bb.2
|
|
; CHECK-NEXT: {{ $}}
|
|
; CHECK-NEXT: bb.2:
|
|
; CHECK-NEXT: S_ENDPGM 0
|
|
bb.0:
|
|
successors: %bb.1
|
|
liveins: $sgpr0, $vgpr0
|
|
|
|
; Preheader: three DS loads (into vgpr10-13, vgpr28 and vgpr32) whose results are read in bb.1.
; The autogenerated checks expect S_WAIT_DSCNT 0 to be inserted before the branch to bb.1.
|
|
$vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
|
|
$vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
|
|
$vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
|
|
S_BRANCH %bb.1
|
|
|
|
bb.1:
|
|
successors: %bb.1, %bb.2
|
|
liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32
|
|
|
|
; Use vgpr10/vgpr11, which are written only by the preheader DS_READ_B128
|
|
$vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
|
|
|
|
; Use vgpr28: written by the preheader DS_READ_B32 and by the in-loop DS_READ_B128 at offset 96 (prefetch)
|
|
$vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec
|
|
|
|
; Barrier between the uses above and the DS loads below
|
|
S_BARRIER
|
|
|
|
; First two in-loop loads (offsets 64 and 80): "flushed" by the same-iteration use of vgpr24/vgpr25 below
|
|
$vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
|
|
$vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
|
|
; Use vgpr32: written by the preheader DS_READ_B32 and by the in-loop DS_READ_B128 at offset 112 (prefetch)
|
|
$vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec
|
|
; These two loads (offsets 96 and 112) are not read again before the backedge, so they remain unflushed - true prefetches for the next iteration
|
|
$vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
|
|
$vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
|
|
|
|
; Use vgpr24/vgpr25 (second in-loop load) - creates the flush point that flushes in-loop loads 1-2
|
|
$vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
|
|
|
|
; Loop control: decrement sgpr0, branch back to bb.1 when SCC is set, otherwise exit to bb.2
|
|
$sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
|
|
S_CBRANCH_SCC1 %bb.1, implicit $scc
|
|
S_BRANCH %bb.2
|
|
|
|
bb.2:
|
|
; Loop exit block: ends the program
S_ENDPGM 0
|
|
...
|