From 1a48e1df4541ccccdaf14a6ea379be004e319a09 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Tue, 22 Apr 2025 13:33:06 +0200 Subject: [PATCH] [AMDGPU] Do not fold COPY with implicit operands (#136003) Folding may remove COPY from inside of the divergent loop. --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 3 +- llvm/test/CodeGen/AMDGPU/do-not-fold-copy.mir | 56 +++++ .../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 5 +- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 216 ++++++++---------- llvm/test/CodeGen/AMDGPU/mul.ll | 6 +- 5 files changed, 160 insertions(+), 126 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/do-not-fold-copy.mir diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 5e8273a968e8..1547142a8d5c 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1091,7 +1091,8 @@ void SIFoldOperandsImpl::foldOperand( } else { if (UseMI->isCopy() && OpToFold.isReg() && UseMI->getOperand(0).getReg().isVirtual() && - !UseMI->getOperand(1).getSubReg()) { + !UseMI->getOperand(1).getSubReg() && + OpToFold.getParent()->implicit_operands().empty()) { LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI); unsigned Size = TII->getOpSize(*UseMI, 1); Register UseReg = OpToFold.getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/do-not-fold-copy.mir b/llvm/test/CodeGen/AMDGPU/do-not-fold-copy.mir new file mode 100644 index 000000000000..5c206da8c544 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/do-not-fold-copy.mir @@ -0,0 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-fold-operands -o - %s | FileCheck %s + +--- +liveins: +name: do_not_fold_copy_with_implicit_exec +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: do_not_fold_copy_with_implicit_exec + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.0, %4, %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.1 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[S_MOV_B64_]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, 0, killed [[DEF]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + + bb.0: + %0:sreg_64 = S_MOV_B64 0 + %1:sreg_64 = S_MOV_B64 0 + %2:sreg_32 = S_MOV_B32 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %3:sreg_64 = PHI %1, %bb.0, %4, %bb.1 + %5:sreg_32 = PHI %2, %bb.0, %6, %bb.1 + %6:sreg_32 = S_ADD_I32 %5, 1, implicit-def dead $scc + %4:sreg_64 = SI_IF_BREAK %0, %3, implicit-def dead $scc + %7:vgpr_32 = COPY %6, implicit $exec + SI_LOOP %4, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.2 + + bb.2: + SI_END_CF %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %9:vgpr_32 = COPY %7 + %10:sreg_64_xexec = IMPLICIT_DEF + %11:vgpr_32 = V_SET_INACTIVE_B32 0, %9, 0, 0, killed %10, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 1e2bf8256321..c2f8c2c44316 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -429,13 +429,14 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s9, 0, v0 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s8 ; DAGISEL12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1 +; DAGISEL12-NEXT: v_mov_b32_e32 v11, s9 ; DAGISEL12-NEXT: s_or_b32 s4, vcc_lo, s4 ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; DAGISEL12-NEXT: s_cbranch_execnz .LBB3_2 ; DAGISEL12-NEXT: ; %bb.3: ; %tail.loopexit ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; DAGISEL12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_add_nc_u32 v10, 42, v1 +; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v1 ; DAGISEL12-NEXT: .LBB3_4: ; %Flow1 ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -526,13 +527,13 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; DAGISEL10-NEXT: v_cmp_ne_u32_e64 s9, 0, v0 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s8 ; DAGISEL10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1 +; DAGISEL10-NEXT: v_mov_b32_e32 v11, s9 ; DAGISEL10-NEXT: s_or_b32 s4, vcc_lo, s4 ; DAGISEL10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; DAGISEL10-NEXT: s_cbranch_execnz .LBB3_2 ; DAGISEL10-NEXT: ; %bb.3: ; %tail.loopexit ; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v1 -; DAGISEL10-NEXT: v_mov_b32_e32 v11, s9 ; DAGISEL10-NEXT: .LBB3_4: ; %Flow1 ; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL10-NEXT: s_mov_b32 s3, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index d0042bb69240..a0d587ac68ff 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1425,41 +1425,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a31, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1487,41 +1488,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a31, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a30, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a29, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a28, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a27, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a26, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a25, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a24, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a23, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a22, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a21, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a20, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a19, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a18, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a16, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a15, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a14, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a13, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a12, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a11, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a10, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a9, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a8, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a7, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a6, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a5, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a4, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a3, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s1 +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -1696,6 +1698,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1725,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader @@ -1759,6 +1762,8 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1788,8 +1793,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader @@ -2050,66 +2054,38 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a3, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a6, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a9, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a15, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a18, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a21, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a26, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a27, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v33, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a30, v33 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 ; GFX908-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 896f48a9215b..0f47a31f52dc 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2619,13 +2619,13 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v0 ; SI-NEXT: s_mul_i32 s5, s14, s9 -; SI-NEXT: s_mul_i32 s4, s12, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 ; SI-NEXT: s_mul_i32 s5, s15, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 ; SI-NEXT: s_mul_i32 s5, s14, s8 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_add_i32_e32 v2, vcc, s5, v2 +; SI-NEXT: s_mul_i32 s4, s12, s10 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2 ; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, s12 ; SI-NEXT: v_mul_hi_u32 v5, s8, v1