AMDGPU: Check for subreg match when folding through reg_sequence (#140582)

We need to consider the use instruction's interpretation of the bits, not the defined immediate without use context. This will regress some cases where we previously could match f64 inline constants. We can restore them by either using pseudo instructions to materialize f64 constants, or recognizing reg_sequence decomposed into 32-bit pieces for them (which essentially means recognizing every other input is a 0). Fixes #139908
2025-05-19 21:44:44 +02:00 · 2025-05-19 21:44:44 +02:00 · 36018494fd
commit 36018494fd
parent 2b7cc2b03e
7 changed files with 243 additions and 36 deletions
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@ -895,6 +895,8 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
  if (!SrcRC)
    return {};

+  // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize
+  // every other element is 0 for 64-bit immediates).
  int64_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
@ -924,10 +926,41 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
    return nullptr;

-  // FIXME: Verify SplatRC is compatible with the use operand
-  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
-  if (!TII->isInlineConstant(*SplatVal, OpTy) ||
-      !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
+  int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
+  if (RCID == -1)
+    return nullptr;
+
+  // Special case 0/-1, since when interpreted as a 64-bit element both halves
+  // have the same bits. Effectively this code does not handle 64-bit element
+  // operands correctly, as the incoming 64-bit constants are already split into
+  // 32-bit sequence elements.
+  //
+  // TODO: We should try to figure out how to interpret the reg_sequence as a
+  // split 64-bit splat constant, or use 64-bit pseudos for materializing f64
+  // constants.
+  if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) {
+    const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
+    // We need to figure out the scalar type read by the operand. e.g. the MFMA
+    // operand will be AReg_128, and we want to check if it's compatible with an
+    // AReg_32 constant.
+    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
+    switch (OpTy) {
+    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
+      break;
+    default:
+      return nullptr;
+    }
+
+    if (!TRI->getCommonSubClass(OpRC, SplatRC))
+      return nullptr;
+  }
+
+  if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
    return nullptr;

  return SplatVal;
@ -1039,14 +1072,13 @@ void SIFoldOperandsImpl::foldOperand(
        }
      }

-      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
-        continue;
-
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

-      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
-                  CopiesToReplace);
+      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
+        continue;
+
+      foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
    }

    return;
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@ -192,8 +192,10 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
 ;
 ; GISEL-LABEL: s_csh_v4i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], 31
-; GISEL-NEXT:    s_and_b64 s[6:7], s[6:7], 31
+; GISEL-NEXT:    s_mov_b32 s8, 31
+; GISEL-NEXT:    s_mov_b32 s9, s8
+; GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GISEL-NEXT:    s_and_b64 s[6:7], s[6:7], s[8:9]
 ; GISEL-NEXT:    s_lshl_b32 s8, s0, s4
 ; GISEL-NEXT:    s_lshl_b32 s9, s1, s5
 ; GISEL-NEXT:    s_lshl_b32 s10, s2, s6
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@ -745,7 +745,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], 1
+; GFX12-SDAG-NEXT:    s_mov_b32 s0, 1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, s0
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
 ; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@ -262,11 +262,19 @@ bb:
  ret void
 }

-; FIXME: This should not be foldable as an inline immediate
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
+; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
+
+; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
 ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
+; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
 ; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
 ; GCN:    global_store_dwordx4
 ; GCN:    global_store_dwordx4
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@ -155,7 +155,9 @@ define i32 @issue139908(i64 %in) {
 ; CHECK-LABEL: issue139908:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, 42, v[0:1]
+; CHECK-NEXT:    s_mov_b32 s4, 42
+; CHECK-NEXT:    s_mov_b32 s5, s4
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 2, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %eq = icmp eq i64 %in, 180388626474
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s

 define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
 ; GFX900-LABEL: fadd_v2_vv:
@ -411,10 +411,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b32 s2, 1.0
+; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
 ; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
 ; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-GISEL-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -1186,10 +1188,12 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
 ; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], 4.0
+; PACKED-GISEL-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
 ; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-GISEL-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -1594,6 +1598,40 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0]
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fma_v2_v_imm:
+; GFX90A-GISEL:       ; %bb.0:
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s4, 0x43480000
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_imm:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 0x43480000
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@ -1675,19 +1713,39 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
 ;
-; PACKED-GISEL-LABEL: fma_v2_v_lit_splat:
-; PACKED-GISEL:       ; %bb.0:
-; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; PACKED-GISEL-NEXT:    s_mov_b32 s2, 1.0
-; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
-; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
-; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
-; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; PACKED-GISEL-NEXT:    s_endpgm
+; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
+; GFX90A-GISEL:       ; %bb.0:
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_lit_splat:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s4
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@ -1725,6 +1783,40 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit:
+; GFX90A-GISEL:       ; %bb.0:
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT:    s_mov_b32 s5, 2.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, 2.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@ -2059,6 +2151,37 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; PACKED-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0:
+; GFX90A-GISEL:       ; %bb.0: ; %bb
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v0, v1
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fadd_fadd_fsub_0:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
 bb:
  %i12 = fadd <2 x float> zeroinitializer, %arg
  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@ -2099,6 +2222,40 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
 ; PACKED-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; PACKED-SDAG-NEXT:    s_endpgm
+;
+; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
+; GFX90A-GISEL:       ; %bb.0: ; %bb
+; GFX90A-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX90A-GISEL-NEXT:    v_sub_f32_e32 v0, s0, v2
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-GISEL-NEXT:    v_pk_add_f32 v[2:3], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT:    v_subrev_f32_e32 v1, s3, v2
+; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90A-GISEL-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: fadd_fadd_fsub:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v0, s0, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:    v_pk_add_f32 v[2:3], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subrev_f32_e32 v1, s3, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-GISEL-NEXT:    s_endpgm
 bb:
  %i12 = fadd <2 x float> %arg, %arg1
  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@ -2251,3 +2408,6 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX90A-SDAG: {{.*}}
+; GFX942-SDAG: {{.*}}
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
@ -140,7 +140,7 @@ body:             |
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42
    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
-    ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], 42, implicit-def $scc
+    ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], [[REG_SEQUENCE]], implicit-def $scc
    ; CHECK-NEXT: S_ENDPGM 0, implicit $scc
    %0:sgpr_64 = COPY $sgpr8_sgpr9
    %1:sreg_32 = S_MOV_B32 42