diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index e945564fb1f3..a014cfdc6496 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -292,8 +292,15 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { TII->commuteInstruction(MI)) { ModeTy NewModeCommuted; computeMode(NewModeCommuted, MI, Ops.first, Ops.second); - if (CurrentMode.isCompatible(NewModeCommuted)) - return false; + if (CurrentMode.isCompatible(NewModeCommuted)) { + // Update CurrentMode with mode bits the commuted instruction relies on. + // This prevents later instructions from piggybacking and corrupting + // those bits (e.g., a nullopt src treated as 0 could be overwritten). + bool Unused = false; + CurrentMode.update(NewModeCommuted, Unused); + // MI was modified by the commute above. + return true; + } // Commute back. if (!TII->commuteInstruction(MI)) llvm_unreachable("Failed to restore commuted instruction."); diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir index 809f648b5643..1215bc736434 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir @@ -346,15 +346,20 @@ body: | ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode + ; The v_fmamk below needs src2=1. After the commute, compatible piggybacking + ; is still allowed, so src2=1 is piggybacked here. ; GCN-NEXT: s_set_vgpr_msb 0x551 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=1 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode - ; Commute FMAAK to avoid mode change + ; Commute FMAAK to avoid mode change. After commute, CurrentMode is updated + ; to lock in the mode bits the commuted instruction relies on. ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v2 /*v258*/, v1, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode + ; This v_fmamk needs src2=1. Since src2 was nullopt (not locked), it can + ; piggyback onto the s_set_vgpr_msb above by adding src2=1. ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode @@ -473,6 +478,9 @@ body: | ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2 $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec + ; GCN-NEXT: s_set_vgpr_msb 0x100 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + ; ASM: NumVgprs: 513 ... @@ -964,3 +972,41 @@ body: | ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[0:15], v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... + +# Test that incompatible piggybacking is prevented after a successful commute. +# After commute, CurrentMode is updated with the commuted instruction's mode. +# A subsequent instruction that would CHANGE an existing mode bit cannot +# piggyback. However, instructions that only ADD new mode bits (without +# changing existing ones) can still piggyback - see fmak_fmamk_vop3 test. + +# ASM-LABEL: {{^}}commute_no_piggyback: +# DIS-LABEL: : +--- +name: commute_no_piggyback +tracksRegLiveness: true +body: | + bb.0: + ; ASM: %bb.0: + + ; Step 1: v_mov sets dst=1, src0=2. src1 is nullopt (defaults to 0). + ; GCN-NEXT: s_set_vgpr_msb 0x42 + ; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0 + ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/ + $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec + + ; Step 2: v_add is commuted to be compatible (src0=2, src1=0). + ; GCN-NEXT: v_add_f32_e64 v0 /*v256*/, v0 /*v512*/, v1 + $vgpr256 = V_ADD_F32_e64 0, undef $vgpr1, 0, undef $vgpr512, 0, 0, implicit $exec, implicit $mode + + ; Step 3: v_add needs src1=2. Without the fix, this piggybacks onto step 1's + ; s_set_vgpr_msb, changing src1 from 0 to 2 and breaking step 2. + ; GCN-NEXT: s_set_vgpr_msb 0x424a + ; ASM-SAME: ; msbs: dst=1 src0=2 src1=2 src2=0 + ; GCN-NEXT: v_add_f32_e64 v0 /*v256*/, v0 /*v512*/, v1 /*v513*/ + $vgpr256 = V_ADD_F32_e64 0, undef $vgpr512, 0, undef $vgpr513, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0x4a00 + ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 + + ; ASM: NumVgprs: 514 +...