[AMDGPU] Fix SIFixSGPRCopies handling of STRICT_WWM and friends (#142122)

SIFixSGPRCopies handled STRICT_WWM (and similar WWM/WQM pseudos) like a
COPY. In particular, if the source was a VGPR and the result was an
SGPR, lowerVGPR2SGPRCopies would replace it with a readfirstlane,
erasing the original pseudo and hence sabotaging the WWM region marking
which is supposed to be performed by SIWholeQuadMode.

Fix this by handling it more like INSERT_SUBREG, PHI and REG_SEQUENCE:
if the source is a VGPR then move the result to a VGPR, and keep the
pseudo.
This commit is contained in:
Jay Foad 2025-05-30 16:32:56 +01:00 committed by GitHub
parent ccb6b0dafd
commit f8d3bdf6a2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 72 additions and 5 deletions

View File

@ -634,11 +634,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
switch (MI.getOpcode()) { switch (MI.getOpcode()) {
default: default:
continue; continue;
case AMDGPU::COPY: case AMDGPU::COPY: {
case AMDGPU::WQM:
case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM: {
const TargetRegisterClass *SrcRC, *DstRC; const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
@ -662,6 +658,10 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
break; break;
} }
case AMDGPU::WQM:
case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM:
case AMDGPU::INSERT_SUBREG: case AMDGPU::INSERT_SUBREG:
case AMDGPU::PHI: case AMDGPU::PHI:
case AMDGPU::REG_SEQUENCE: { case AMDGPU::REG_SEQUENCE: {

View File

@ -0,0 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
; Regression test for llvm.amdgcn.wwm results that are consumed by scalar
; instructions. In the expected output the v_readfirstlane_b32 of the first
; v_cndmask_b32 result stays between s_or_saveexec_b32 and the write that
; restores exec_lo (presumably the whole-wave-mode region; confirm against
; SIWholeQuadMode).
define amdgpu_gs i32 @main() {
; CHECK-LABEL: main:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_bitcmp1_b32 0, 0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_or_saveexec_b32 s2, -1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_readfirstlane_b32 s1, v0
; CHECK-NEXT: s_mov_b32 exec_lo, s2
; CHECK-NEXT: s_or_b32 s0, s0, s1
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_xor_b32 s0, s0, -1
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
; CHECK-NEXT: s_wait_alu 0xf1ff
; CHECK-NEXT: ; return to shader part epilog
bb:
%i = call i1 @llvm.amdgcn.readfirstlane.i1(i1 false)
br label %bb1
bb1:
%i2 = zext i1 %i to i32
%i3 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
%i4 = call i32 @llvm.amdgcn.wwm.i32(i32 %i2)
%i5 = trunc i32 %i4 to i1
%i6 = trunc i32 %i3 to i1
%i7 = or i1 %i6, %i5
%i8 = select i1 %i7, i32 0, i32 1
ret i32 %i8
}

View File

@ -0,0 +1,27 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-fix-sgpr-copies %s -o - | FileCheck %s
# Runs only si-fix-sgpr-copies on a STRICT_WWM whose source is a VGPR
# (vgpr_32) and whose result is an SGPR (sreg_32). The expected output still
# contains the STRICT_WWM pseudo; a V_READFIRSTLANE_B32 of the VGPR is
# inserted in front of it and becomes its source operand.
---
name: main
tracksRegLiveness: true
body: |
bb.0:
; CHECK-LABEL: name: main
; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[DEF1]], implicit $exec
; CHECK-NEXT: early-clobber %2:sreg_32 = STRICT_WWM killed undef [[V_READFIRSTLANE_B32_]], implicit $exec
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed undef [[DEF]], killed undef %2, implicit-def dead $scc
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed undef [[S_OR_B32_]], implicit-def dead $scc
; CHECK-NEXT: S_CMP_EQ_U32 killed undef [[S_AND_B32_]], 1, implicit-def $scc
; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 killed undef [[S_AND_B32_]], killed undef [[S_AND_B32_]], implicit-def dead $scc
; CHECK-NEXT: SI_RETURN_TO_EPILOG undef $sgpr0
%0:sreg_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
early-clobber %2:sreg_32 = STRICT_WWM killed undef %1, implicit $exec
%3:sreg_32 = S_OR_B32 killed undef %0, killed undef %2, implicit-def dead $scc
%4:sreg_32 = S_AND_B32 1, killed undef %3, implicit-def dead $scc
S_CMP_EQ_U32 killed undef %4, 1, implicit-def $scc
%5:sreg_32_xm0_xexec = S_XOR_B32 killed undef %4, killed undef %4, implicit-def dead $scc
SI_RETURN_TO_EPILOG undef $sgpr0
...