diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index fb7b5802173f..0dcea2b5c42b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -1061,6 +1061,8 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI,
     MI.eraseFromParent();
     return true;
   }
+  case ApplyINTRIN_IMAGE:
+    return applyRegisterBanksINTRIN_IMAGE(MI);
   }
 
   if (!WFI.SgprWaterfallOperandRegs.empty()) {
@@ -1751,3 +1753,70 @@ void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
     }
   }
 }
+
+bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
+  const AMDGPU::RsrcIntrinsic *RSrcIntrin =
+      AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
+  assert(RSrcIntrin && RSrcIntrin->IsImage);
+
+  unsigned RsrcIdx = RSrcIntrin->RsrcArg;
+  const unsigned NumDefs = MI.getNumExplicitDefs();
+
+  // The reported argument index is relative to the IR intrinsic call arguments,
+  // so we need to shift by the number of defs and the intrinsic ID.
+  RsrcIdx += NumDefs + 1;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
+
+  // Defs(for image loads with return) are vgpr.
+  for (unsigned i = 0; i < NumDefs; ++i) {
+    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
+    if (RB == VgprRB)
+      continue;
+
+    Register Reg = MI.getOperand(i).getReg();
+    Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
+    MI.getOperand(i).setReg(NewVgprDst);
+    buildReadAnyLane(B, Reg, NewVgprDst, RBI);
+  }
+
+  B.setInstrAndDebugLoc(MI);
+
+  // Register uses(before RsrcIdx) are vgpr.
+  for (unsigned i = 1; i < RsrcIdx; ++i) {
+    MachineOperand &Op = MI.getOperand(i);
+    if (!Op.isReg())
+      continue;
+
+    Register Reg = Op.getReg();
+    if (!Reg.isVirtual())
+      continue;
+
+    if (MRI.getRegBank(Reg) == VgprRB)
+      continue;
+
+    auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
+    Op.setReg(Copy.getReg(0));
+  }
+
+  SmallSet<Register, 4> OpsToWaterfall;
+
+  // Register use RsrcIdx(and RsrcIdx+1 in some cases) is sgpr.
+  for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
+    MachineOperand &Op = MI.getOperand(i);
+    if (!Op.isReg())
+      continue;
+
+    Register Reg = Op.getReg();
+    if (MRI.getRegBank(Reg) != SgprRB)
+      OpsToWaterfall.insert(Reg);
+  }
+
+  if (!OpsToWaterfall.empty()) {
+    MachineBasicBlock::iterator MII = MI.getIterator();
+    executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 4d3edae0c125..22e381b389b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -140,6 +140,7 @@ private:
   bool lowerSplitTo32SExtInReg(MachineInstr &MI);
   bool lowerUnpackMinMax(MachineInstr &MI);
   bool lowerUnpackAExt(MachineInstr &MI);
+  bool applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI);
 };
 
 } // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bc0c3b651b3b..894db7b88915 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -645,6 +645,16 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
       .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
 
+  // LOAD       {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
+  // LOAD       {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
+  // LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
+  // STORE      {}, {{}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
+  addRulesForGOpcs({G_AMDGPU_INTRIN_IMAGE_LOAD, G_AMDGPU_INTRIN_IMAGE_LOAD_D16,
+                    G_AMDGPU_INTRIN_IMAGE_LOAD_NORET,
+                    G_AMDGPU_INTRIN_IMAGE_STORE,
+                    G_AMDGPU_INTRIN_IMAGE_STORE_D16})
+      .Any({{}, {{}, {}, ApplyINTRIN_IMAGE}});
+
   Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
     auto Pred =
         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 24a1624358c6..36063302e99c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -266,7 +266,8 @@ enum LoweringMethodID {
   UnpackAExt,
   VerifyAllSgpr,
   ApplyAllVgpr,
-  UnmergeToShiftTrunc
+  UnmergeToShiftTrunc,
+  ApplyINTRIN_IMAGE
 };
 
 enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
index 62a5313dc8d3..0e32d307bfa7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+
 define void @main(<19 x i32> %arg) {
 ; GCN-LABEL: main:
 ; GCN:       ; %bb.0: ; %bb
@@ -51,7 +52,7 @@ define void @main(<19 x i32> %arg) {
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
@@ -76,7 +77,3 @@ bb:
   ret void
 }
 declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10PLUS: {{.*}}
-; GPRIDX: {{.*}}
-; MOVREL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index 4ed1cb2d1260..d9bccd66d2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -o - %s | FileCheck %s
 
 ; Make sure the waterfall loop does not fail the verifier after regalloc fast
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
index d2ee00833e46..b072fec5f8f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
@@ -1,13 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s) {
 ; GFX9-LABEL: atomic_swap_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -15,16 +16,19 @@ define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_swap_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -35,10 +39,12 @@ define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_swap_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -56,6 +62,7 @@ define amdgpu_ps float @atomic_add_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_add_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -63,16 +70,19 @@ define amdgpu_ps float @atomic_add_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -83,10 +93,12 @@ define amdgpu_ps float @atomic_add_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_add_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -104,6 +116,7 @@ define amdgpu_ps float @atomic_sub_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_sub_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -111,16 +124,19 @@ define amdgpu_ps float @atomic_sub_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_sub v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_sub_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -131,10 +147,12 @@ define amdgpu_ps float @atomic_sub_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_sub_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -152,6 +170,7 @@ define amdgpu_ps float @atomic_smin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_smin_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -159,16 +178,19 @@ define amdgpu_ps float @atomic_smin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_smin v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_smin_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -179,10 +201,12 @@ define amdgpu_ps float @atomic_smin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_smin_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -200,6 +224,7 @@ define amdgpu_ps float @atomic_umin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_umin_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -207,16 +232,19 @@ define amdgpu_ps float @atomic_umin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_umin v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_umin_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -227,10 +255,12 @@ define amdgpu_ps float @atomic_umin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_umin_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -248,6 +278,7 @@ define amdgpu_ps float @atomic_smax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_smax_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -255,16 +286,19 @@ define amdgpu_ps float @atomic_smax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_smax v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_smax_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -275,10 +309,12 @@ define amdgpu_ps float @atomic_smax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_smax_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -296,6 +332,7 @@ define amdgpu_ps float @atomic_umax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_umax_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -303,16 +340,19 @@ define amdgpu_ps float @atomic_umax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_umax v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_umax_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -323,10 +363,12 @@ define amdgpu_ps float @atomic_umax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_umax_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -344,6 +386,7 @@ define amdgpu_ps float @atomic_and_i321d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ; GFX9-LABEL: atomic_and_i321d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -351,16 +394,19 @@ define amdgpu_ps float @atomic_and_i321d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_and v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_and_i321d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -371,10 +417,12 @@ define amdgpu_ps float @atomic_and_i321d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ;
 ; GFX12-LABEL: atomic_and_i321d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -392,6 +440,7 @@ define amdgpu_ps float @atomic_or_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ; GFX9-LABEL: atomic_or_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -399,16 +448,19 @@ define amdgpu_ps float @atomic_or_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_or v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_or_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -419,10 +471,12 @@ define amdgpu_ps float @atomic_or_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ;
 ; GFX12-LABEL: atomic_or_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -440,6 +494,7 @@ define amdgpu_ps float @atomic_xor_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_xor_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -447,16 +502,19 @@ define amdgpu_ps float @atomic_xor_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_xor v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_xor_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -467,10 +525,12 @@ define amdgpu_ps float @atomic_xor_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_xor_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -488,6 +548,7 @@ define amdgpu_ps float @atomic_inc_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_inc_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -495,16 +556,19 @@ define amdgpu_ps float @atomic_inc_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_inc v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_inc_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -515,10 +579,12 @@ define amdgpu_ps float @atomic_inc_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_inc_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -536,6 +602,7 @@ define amdgpu_ps float @atomic_dec_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-LABEL: atomic_dec_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -543,16 +610,19 @@ define amdgpu_ps float @atomic_dec_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_dec v0, v1, s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_dec_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -563,10 +633,12 @@ define amdgpu_ps float @atomic_dec_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ;
 ; GFX12-LABEL: atomic_dec_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -584,6 +656,7 @@ define amdgpu_ps float @atomic_cmpswap_i32_1d(<8 x i32> inreg %rsrc, i32 %cmp, i
 ; GFX9-LABEL: atomic_cmpswap_i32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -591,16 +664,19 @@ define amdgpu_ps float @atomic_cmpswap_i32_1d(<8 x i32> inreg %rsrc, i32 %cmp, i
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_cmpswap_i32_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -611,10 +687,12 @@ define amdgpu_ps float @atomic_cmpswap_i32_1d(<8 x i32> inreg %rsrc, i32 %cmp, i
 ;
 ; GFX12-LABEL: atomic_cmpswap_i32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -687,6 +765,8 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -694,40 +774,44 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX9-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
+; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_3d:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i32_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -741,6 +825,8 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -748,40 +834,44 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX9-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
+; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_cube:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i32_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -849,6 +939,8 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -856,40 +948,44 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX9-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
+; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_2darray:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i32_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -903,6 +999,8 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -910,40 +1008,44 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
-; GFX9-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
+; GFX9-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_2dmsaa:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i32_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1016,6 +1118,7 @@ define amdgpu_ps float @atomic_add_i32_1d_slc(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9-LABEL: atomic_add_i32_1d_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1023,16 +1126,19 @@ define amdgpu_ps float @atomic_add_i32_1d_slc(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc slc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_1d_slc:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1043,10 +1149,12 @@ define amdgpu_ps float @atomic_add_i32_1d_slc(<8 x i32> inreg %rsrc, i32 %data,
 ;
 ; GFX12-LABEL: atomic_add_i32_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1064,6 +1172,7 @@ define amdgpu_ps <2 x float> @atomic_swap_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-LABEL: atomic_swap_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1071,16 +1180,19 @@ define amdgpu_ps <2 x float> @atomic_swap_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_swap_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1091,10 +1203,12 @@ define amdgpu_ps <2 x float> @atomic_swap_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ;
 ; GFX12-LABEL: atomic_swap_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1112,6 +1226,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-LABEL: atomic_add_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1119,16 +1234,19 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1139,10 +1257,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ;
 ; GFX12-LABEL: atomic_add_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1160,6 +1280,7 @@ define amdgpu_ps <2 x float> @atomic_sub_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-LABEL: atomic_sub_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1167,16 +1288,19 @@ define amdgpu_ps <2 x float> @atomic_sub_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_sub v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_sub_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1187,10 +1311,12 @@ define amdgpu_ps <2 x float> @atomic_sub_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ;
 ; GFX12-LABEL: atomic_sub_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1208,6 +1334,7 @@ define amdgpu_ps <2 x float> @atomic_smin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-LABEL: atomic_smin_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1215,16 +1342,19 @@ define amdgpu_ps <2 x float> @atomic_smin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_smin v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_smin_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1235,10 +1365,12 @@ define amdgpu_ps <2 x float> @atomic_smin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ;
 ; GFX12-LABEL: atomic_smin_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1256,6 +1388,7 @@ define amdgpu_ps <2 x float> @atomic_umin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-LABEL: atomic_umin_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1263,16 +1396,19 @@ define amdgpu_ps <2 x float> @atomic_umin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_umin v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_umin_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1283,10 +1419,12 @@ define amdgpu_ps <2 x float> @atomic_umin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ;
 ; GFX12-LABEL: atomic_umin_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1304,6 +1442,7 @@ define amdgpu_ps <2 x float> @atomic_smax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-LABEL: atomic_smax_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1311,16 +1450,19 @@ define amdgpu_ps <2 x float> @atomic_smax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_smax v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_smax_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1331,10 +1473,12 @@ define amdgpu_ps <2 x float> @atomic_smax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ;
 ; GFX12-LABEL: atomic_smax_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1352,6 +1496,7 @@ define amdgpu_ps <2 x float> @atomic_umax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-LABEL: atomic_umax_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1359,16 +1504,19 @@ define amdgpu_ps <2 x float> @atomic_umax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_umax v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_umax_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1379,10 +1527,12 @@ define amdgpu_ps <2 x float> @atomic_umax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ;
 ; GFX12-LABEL: atomic_umax_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1400,6 +1550,7 @@ define amdgpu_ps <2 x float> @atomic_and_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-LABEL: atomic_and_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1407,16 +1558,19 @@ define amdgpu_ps <2 x float> @atomic_and_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_and_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1427,10 +1581,12 @@ define amdgpu_ps <2 x float> @atomic_and_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ;
 ; GFX12-LABEL: atomic_and_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1448,6 +1604,7 @@ define amdgpu_ps <2 x float> @atomic_or_i64_1d(<8 x i32> inreg %rsrc, i64 %data,
 ; GFX9-LABEL: atomic_or_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1455,16 +1612,19 @@ define amdgpu_ps <2 x float> @atomic_or_i64_1d(<8 x i32> inreg %rsrc, i64 %data,
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_or_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1475,10 +1635,12 @@ define amdgpu_ps <2 x float> @atomic_or_i64_1d(<8 x i32> inreg %rsrc, i64 %data,
 ;
 ; GFX12-LABEL: atomic_or_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1496,6 +1658,7 @@ define amdgpu_ps <2 x float> @atomic_xor_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-LABEL: atomic_xor_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1503,16 +1666,19 @@ define amdgpu_ps <2 x float> @atomic_xor_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_xor_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1523,10 +1689,12 @@ define amdgpu_ps <2 x float> @atomic_xor_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ;
 ; GFX12-LABEL: atomic_xor_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1544,6 +1712,7 @@ define amdgpu_ps <2 x float> @atomic_inc_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-LABEL: atomic_inc_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1551,16 +1720,19 @@ define amdgpu_ps <2 x float> @atomic_inc_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_inc v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_inc_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1571,10 +1743,12 @@ define amdgpu_ps <2 x float> @atomic_inc_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ;
 ; GFX12-LABEL: atomic_inc_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1592,6 +1766,7 @@ define amdgpu_ps <2 x float> @atomic_dec_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-LABEL: atomic_dec_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1599,16 +1774,19 @@ define amdgpu_ps <2 x float> @atomic_dec_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_dec v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_dec_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1619,10 +1797,12 @@ define amdgpu_ps <2 x float> @atomic_dec_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ;
 ; GFX12-LABEL: atomic_dec_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1640,6 +1820,7 @@ define amdgpu_ps <2 x float> @atomic_cmpswap_i64_1d(<8 x i32> inreg %rsrc, i64 %
 ; GFX9-LABEL: atomic_cmpswap_i64_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1647,16 +1828,19 @@ define amdgpu_ps <2 x float> @atomic_cmpswap_i64_1d(<8 x i32> inreg %rsrc, i64 %
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v4, s0, 16, v4
 ; GFX9-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_cmpswap_i64_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v4, s0, 16, v4
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -1667,10 +1851,12 @@ define amdgpu_ps <2 x float> @atomic_cmpswap_i64_1d(<8 x i32> inreg %rsrc, i64 %
 ;
 ; GFX12-LABEL: atomic_cmpswap_i64_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v4, s0, 16, v4
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -1743,6 +1929,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1750,40 +1938,44 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
-; GFX9-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16
+; GFX9-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
+; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_3d:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i64_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1797,6 +1989,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1804,40 +1998,44 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
-; GFX9-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da
+; GFX9-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
+; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_cube:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i64_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1905,6 +2103,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1912,40 +2112,44 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
-; GFX9-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da
+; GFX9-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
+; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_2darray:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i64_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1959,6 +2163,8 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -1966,40 +2172,44 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
-; GFX9-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16
+; GFX9-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
+; GFX9-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_2dmsaa:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: atomic_add_i64_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16
+; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2072,6 +2282,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d_slc(<8 x i32> inreg %rsrc, i64 %
 ; GFX9-LABEL: atomic_add_i64_1d_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -2079,16 +2290,19 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d_slc(<8 x i32> inreg %rsrc, i64 %
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc slc a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_1d_slc:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -2099,10 +2313,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d_slc(<8 x i32> inreg %rsrc, i64 %
 ;
 ; GFX12-LABEL: atomic_add_i64_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
index 09e1fca3f267..b1c457c236ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX900 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX900 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
 ; GFX6-LABEL: atomic_swap_i32_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index 13c697f49784..4b72a9bd9f74 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX10NSA %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -o - %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
 ; GFX9-LABEL: gather4_2d:
@@ -85,6 +85,8 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -96,9 +98,9 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT:    image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da
+; GFX9-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -108,6 +110,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10NSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
@@ -120,8 +123,9 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -131,6 +135,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
@@ -143,8 +148,9 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -159,6 +165,8 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -170,9 +178,9 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT:    image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da
+; GFX9-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -182,6 +190,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10NSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
@@ -194,8 +203,9 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -205,6 +215,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
@@ -217,8 +228,9 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -307,6 +319,8 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -318,9 +332,9 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT:    image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT:    image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -330,6 +344,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10NSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
@@ -342,8 +357,9 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -353,6 +369,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
@@ -365,8 +382,9 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX12-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    image_gather4_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -380,9 +398,9 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -394,9 +412,9 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT:    image_gather4_c_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT:    image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -406,6 +424,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10NSA-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10NSA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
@@ -418,8 +437,9 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10NSA-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -429,6 +449,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
@@ -441,8 +462,9 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX12-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -456,6 +478,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
@@ -468,6 +491,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
@@ -479,6 +503,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
@@ -491,6 +516,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -502,6 +528,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
@@ -514,6 +541,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s9, s11
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -530,6 +558,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX9-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
@@ -542,6 +571,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
@@ -553,6 +583,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
@@ -565,6 +596,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT:    image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -576,6 +608,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
@@ -588,6 +621,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_mov_b32 s9, s11
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
@@ -604,9 +638,10 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -618,9 +653,10 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT:    image_gather4_b_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT:    image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -629,7 +665,9 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10NSA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
@@ -641,9 +679,11 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10NSA-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -652,7 +692,9 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
@@ -664,9 +706,11 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s9, s11
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX12-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -680,8 +724,10 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX9-NEXT:    s_mov_b32 s0, s2
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -693,8 +739,8 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -705,7 +751,9 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10NSA-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
@@ -717,9 +765,11 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX10NSA-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -728,7 +778,9 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
@@ -740,9 +792,11 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    s_mov_b32 s9, s11
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-NEXT:    v_lshl_or_b32 v3, s0, 16, v4
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX12-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -755,6 +809,8 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -766,19 +822,21 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
-; GFX9-NEXT:    image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX9-NEXT:    image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10NSA-LABEL: gather4_l_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
 ; GFX10NSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10NSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
-; GFX10NSA-NEXT:    s_mov_b32 s3, s5
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
+; GFX10NSA-NEXT:    s_mov_b32 s3, s5
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s6, s8
@@ -787,18 +845,20 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: gather4_l_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -807,7 +867,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s9, s11
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
-; GFX12-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    image_gather4_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -818,10 +878,10 @@ main_body:
 define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) {
 ; GFX9-LABEL: gather4_c_l_2d:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -833,19 +893,21 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX9-NEXT:    s_mov_b32 s9, s11
 ; GFX9-NEXT:    s_mov_b32 s10, s12
 ; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v0
-; GFX9-NEXT:    image_gather4_c_l v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
+; GFX9-NEXT:    image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10NSA-LABEL: gather4_c_l_2d:
 ; GFX10NSA:       ; %bb.0: ; %main_body
 ; GFX10NSA-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10NSA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10NSA-NEXT:    s_mov_b32 s0, s2
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
-; GFX10NSA-NEXT:    s_mov_b32 s3, s5
 ; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX10NSA-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX10NSA-NEXT:    s_mov_b32 s3, s5
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s6, s8
@@ -854,18 +916,20 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: gather4_c_l_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v3
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -874,7 +938,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_mov_b32 s9, s11
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
-; GFX12-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index 841f4f1ac055..e4ed76da8d70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
index 91b34ae21b48..b52eda84b48f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
-; RUN: not llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+; RUN: not llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
 
-; GFX11-ERR: LLVM ERROR: cannot select: {{.*}} = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4
+; image_gather4_.*_o, were removed in gfx11+
+; GFX11-ERR: LLVM ERROR: cannot select: {{.*}} = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.gather4.{{.*}}o{{.*}})
 
 define amdgpu_ps <4 x float> @gather4_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) {
 ; GFX6-LABEL: gather4_o_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
index 318337892725..aa8bfdabbe4e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
@@ -1,13 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-LABEL: getresinfo_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -15,16 +16,19 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -35,10 +39,12 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i16 %mip) {
 ;
 ; GFX12-LABEL: getresinfo_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -55,6 +61,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-LABEL: getresinfo_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -62,16 +69,19 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_2d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -82,10 +92,12 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i16 %mip) {
 ;
 ; GFX12-LABEL: getresinfo_2d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -102,6 +114,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-LABEL: getresinfo_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -109,16 +122,19 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_3d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -129,10 +145,12 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i16 %mip) {
 ;
 ; GFX12-LABEL: getresinfo_3d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -149,6 +167,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-LABEL: getresinfo_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -156,16 +175,19 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_cube:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -176,10 +198,12 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i16 %mip) {
 ;
 ; GFX12-LABEL: getresinfo_cube:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -196,6 +220,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i16 %mip
 ; GFX9-LABEL: getresinfo_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -203,16 +228,19 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i16 %mip
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_1darray:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -223,10 +251,12 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i16 %mip
 ;
 ; GFX12-LABEL: getresinfo_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -243,6 +273,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i16 %mip
 ; GFX9-LABEL: getresinfo_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -250,16 +281,19 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i16 %mip
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_2darray:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -270,10 +304,12 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i16 %mip
 ;
 ; GFX12-LABEL: getresinfo_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -290,6 +326,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i16 %mip)
 ; GFX9-LABEL: getresinfo_2dmsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -297,16 +334,19 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i16 %mip)
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_2dmsaa:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -317,10 +357,12 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i16 %mip)
 ;
 ; GFX12-LABEL: getresinfo_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
@@ -337,6 +379,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i16
 ; GFX9-LABEL: getresinfo_2darraymsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -344,16 +387,19 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i16
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: getresinfo_2darraymsaa:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
@@ -364,10 +410,12 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i16
 ;
 ; GFX12-LABEL: getresinfo_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
index a3c507b18b63..7ca0d308c80c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX6-LABEL: getresinfo_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index 85ab4c99c73a..3ad29224ea5c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 < %s | FileCheck -check-prefix=GFX8-PACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 < %s | FileCheck -check-prefix=GFX8-PACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps half @load_1d_f16_x(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-LABEL: load_1d_f16_x:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
index fc48664c96ce..e09d9b8efe2d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX68 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX68 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=-enable-prt-strict-null < %s | FileCheck -check-prefix=NOPRT %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX68 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX68 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=-enable-prt-strict-null < %s | FileCheck -check-prefix=NOPRT %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_ps float @load_1d_f32_x(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX68-LABEL: load_1d_f32_x:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
index 2d0d04e1b533..44cec8e6cc10 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 ; GFX6-LABEL: load_2d_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
index 676bd8856ce6..246e571667cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
 ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
index a101a15ea814..da2333bdc2be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 ; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index b20dc4b53927..16a2f910dbf2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -1,14 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) {
 ; GFX9-LABEL: load_3d_v4f32_xyzw:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -16,40 +18,44 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
-; GFX9-NEXT:    image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm a16
+; GFX9-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: load_3d_v4f32_xyzw:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10PLUS-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10PLUS-NEXT:    s_mov_b32 s0, s2
 ; GFX10PLUS-NEXT:    s_mov_b32 s1, s3
 ; GFX10PLUS-NEXT:    s_mov_b32 s2, s4
-; GFX10PLUS-NEXT:    s_mov_b32 s3, s5
 ; GFX10PLUS-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10PLUS-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
+; GFX10PLUS-NEXT:    s_mov_b32 s3, s5
 ; GFX10PLUS-NEXT:    s_mov_b32 s4, s6
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_load v[0:3], [v0, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
+; GFX10PLUS-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: load_3d_v4f32_xyzw:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
+; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    image_load v[0:3], [v0, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -60,15 +66,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
 ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9-NEXT:    v_lshl_or_b32 v5, v1, 16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v8, v7
-; GFX9-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-NEXT:    v_mov_b32_e32 v10, v7
-; GFX9-NEXT:    v_mov_b32_e32 v11, v7
-; GFX9-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_lshl_or_b32 v11, s0, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-NEXT:    v_mov_b32_e32 v8, v5
+; GFX9-NEXT:    v_mov_b32_e32 v9, v5
+; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -76,91 +83,95 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_mov_b32_e32 v1, v8
-; GFX9-NEXT:    v_mov_b32_e32 v2, v9
-; GFX9-NEXT:    v_mov_b32_e32 v3, v10
-; GFX9-NEXT:    v_mov_b32_e32 v4, v11
-; GFX9-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm a16 tfe
+; GFX9-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v7
+; GFX9-NEXT:    v_mov_b32_e32 v3, v8
+; GFX9-NEXT:    v_mov_b32_e32 v4, v9
+; GFX9-NEXT:    image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v7, v4, s[10:11]
+; GFX9-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_mov_b32_e32 v12, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v7, v6
-; GFX10-NEXT:    v_mov_b32_e32 v8, v6
-; GFX10-NEXT:    v_mov_b32_e32 v9, v6
-; GFX10-NEXT:    v_mov_b32_e32 v10, v6
-; GFX10-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v5
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v8, v5
+; GFX10-NEXT:    v_mov_b32_e32 v9, v5
+; GFX10-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v11, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v0, v6
-; GFX10-NEXT:    v_mov_b32_e32 v1, v7
-; GFX10-NEXT:    v_mov_b32_e32 v2, v8
-; GFX10-NEXT:    v_mov_b32_e32 v3, v9
-; GFX10-NEXT:    v_mov_b32_e32 v4, v10
-; GFX10-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    v_mov_b32_e32 v2, v7
+; GFX10-NEXT:    v_mov_b32_e32 v3, v8
+; GFX10-NEXT:    v_mov_b32_e32 v4, v9
+; GFX10-NEXT:    image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v6, v4, s[10:11]
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v6, 0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-NEXT:    s_mov_b32 s0, s2
 ; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v5
+; GFX11-NEXT:    v_mov_b32_e32 v7, v5
+; GFX11-NEXT:    v_mov_b32_e32 v8, v5
+; GFX11-NEXT:    v_mov_b32_e32 v9, v5
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT:    v_lshl_or_b32 v11, s0, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s2, s4
-; GFX11-NEXT:    v_mov_b32_e32 v7, v6
-; GFX11-NEXT:    v_mov_b32_e32 v8, v6
-; GFX11-NEXT:    v_mov_b32_e32 v9, v6
-; GFX11-NEXT:    v_mov_b32_e32 v10, v6
-; GFX11-NEXT:    v_mov_b32_e32 v12, v2
-; GFX11-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX11-NEXT:    s_mov_b32 s3, s5
 ; GFX11-NEXT:    s_mov_b32 s4, s6
 ; GFX11-NEXT:    s_mov_b32 s5, s7
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT:    v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v10
-; GFX11-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
+; GFX11-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-NEXT:    image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v6, v4, s[10:11]
+; GFX11-NEXT:    global_store_b32 v5, v4, s[10:11]
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, 0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v9, v5
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v8, v5
+; GFX12-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v11, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX12-NEXT:    v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT:    v_mov_b32_e32 v4, v10
-; GFX12-NEXT:    image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
+; GFX12-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT:    v_mov_b32_e32 v4, v9
+; GFX12-NEXT:    image_load v[0:4], [v10, v11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v6, v4, s[10:11]
+; GFX12-NEXT:    global_store_b32 v5, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0)
   %v.vec = extractvalue { <4 x float>, i32 } %v, 0
@@ -173,15 +184,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9-NEXT:    v_lshl_or_b32 v5, v1, 16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v8, v7
-; GFX9-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-NEXT:    v_mov_b32_e32 v10, v7
-; GFX9-NEXT:    v_mov_b32_e32 v11, v7
-; GFX9-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX9-NEXT:    s_mov_b32 s0, s2
+; GFX9-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_lshl_or_b32 v11, s0, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-NEXT:    v_mov_b32_e32 v8, v5
+; GFX9-NEXT:    v_mov_b32_e32 v9, v5
+; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    s_mov_b32 s1, s3
 ; GFX9-NEXT:    s_mov_b32 s2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, s5
@@ -189,91 +201,95 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX9-NEXT:    s_mov_b32 s5, s7
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
-; GFX9-NEXT:    v_mov_b32_e32 v1, v8
-; GFX9-NEXT:    v_mov_b32_e32 v2, v9
-; GFX9-NEXT:    v_mov_b32_e32 v3, v10
-; GFX9-NEXT:    v_mov_b32_e32 v4, v11
-; GFX9-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm a16 tfe lwe
+; GFX9-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v7
+; GFX9-NEXT:    v_mov_b32_e32 v3, v8
+; GFX9-NEXT:    v_mov_b32_e32 v4, v9
+; GFX9-NEXT:    image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe lwe
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v7, v4, s[10:11]
+; GFX9-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v6, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_mov_b32_e32 v12, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v7, v6
-; GFX10-NEXT:    v_mov_b32_e32 v8, v6
-; GFX10-NEXT:    v_mov_b32_e32 v9, v6
-; GFX10-NEXT:    v_mov_b32_e32 v10, v6
-; GFX10-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v5
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v8, v5
+; GFX10-NEXT:    v_mov_b32_e32 v9, v5
+; GFX10-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v11, s0, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v0, v6
-; GFX10-NEXT:    v_mov_b32_e32 v1, v7
-; GFX10-NEXT:    v_mov_b32_e32 v2, v8
-; GFX10-NEXT:    v_mov_b32_e32 v3, v9
-; GFX10-NEXT:    v_mov_b32_e32 v4, v10
-; GFX10-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    v_mov_b32_e32 v2, v7
+; GFX10-NEXT:    v_mov_b32_e32 v3, v8
+; GFX10-NEXT:    v_mov_b32_e32 v4, v9
+; GFX10-NEXT:    image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v6, v4, s[10:11]
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v6, 0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-NEXT:    s_mov_b32 s0, s2
 ; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v5
+; GFX11-NEXT:    v_mov_b32_e32 v7, v5
+; GFX11-NEXT:    v_mov_b32_e32 v8, v5
+; GFX11-NEXT:    v_mov_b32_e32 v9, v5
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT:    v_lshl_or_b32 v11, s0, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s2, s4
-; GFX11-NEXT:    v_mov_b32_e32 v7, v6
-; GFX11-NEXT:    v_mov_b32_e32 v8, v6
-; GFX11-NEXT:    v_mov_b32_e32 v9, v6
-; GFX11-NEXT:    v_mov_b32_e32 v10, v6
-; GFX11-NEXT:    v_mov_b32_e32 v12, v2
-; GFX11-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX11-NEXT:    s_mov_b32 s3, s5
 ; GFX11-NEXT:    s_mov_b32 s4, s6
 ; GFX11-NEXT:    s_mov_b32 s5, s7
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT:    v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v10
-; GFX11-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
+; GFX11-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-NEXT:    image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v6, v4, s[10:11]
+; GFX11-NEXT:    global_store_b32 v5, v4, s[10:11]
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, 0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    s_mov_b32 s0, s2
 ; GFX12-NEXT:    s_mov_b32 s1, s3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v9, v5
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v8, v5
+; GFX12-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v11, s0, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s2, s4
-; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
-; GFX12-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX12-NEXT:    s_mov_b32 s3, s5
 ; GFX12-NEXT:    s_mov_b32 s4, s6
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX12-NEXT:    v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT:    v_mov_b32_e32 v4, v10
-; GFX12-NEXT:    image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
+; GFX12-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT:    v_mov_b32_e32 v4, v9
+; GFX12-NEXT:    image_load v[0:4], [v10, v11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v6, v4, s[10:11]
+; GFX12-NEXT:    global_store_b32 v5, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0)
   %v.vec = extractvalue { <4 x float>, i32 } %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index 7f32d8e6e16b..ba88b30ee2cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
 ; GFX6-LABEL: load_3d_v4f32_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll
index 159d1e3720c0..a9301d741fb3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll
@@ -1,9 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_cd_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -30,6 +34,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_c_cd_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -56,6 +64,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_cd_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -82,6 +94,10 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_c_cd_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index 86e2d714f78c..8e08a89fe1af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -1,23 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_d_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: sample_d_1d:
 ; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX11-NEXT:    image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: sample_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -64,12 +76,14 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
 ; GFX10-LABEL: sample_d_3d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v9
+; GFX10-NEXT:    v_lshl_or_b32 v3, s0, 16, v9
+; GFX10-NEXT:    v_lshl_or_b32 v4, v4, 16, v10
+; GFX10-NEXT:    v_lshl_or_b32 v5, s0, 16, v5
 ; GFX10-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -77,20 +91,28 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX11-LABEL: sample_d_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX11-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v1, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
+; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
+; GFX11-NEXT:    v_lshl_or_b32 v3, s0, 16, v5
+; GFX11-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: sample_d_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX12-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v1, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v2
+; GFX12-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
+; GFX12-NEXT:    v_lshl_or_b32 v5, s0, 16, v5
+; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -101,18 +123,30 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_c_d_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: sample_c_d_1d:
 ; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX11-NEXT:    image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: sample_c_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -159,18 +193,30 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_d_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: sample_d_cl_1d:
 ; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX11-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: sample_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -217,18 +263,30 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
 ; GFX10-LABEL: sample_c_d_cl_1d:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: sample_c_d_cl_1d:
 ; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX11-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: sample_c_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
index d07eadcd6414..c421ee08336d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s
-; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
-; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX11 %s
-; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s
+; FIXME: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; FIXME: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+; FIXME: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX11 %s
+; FIXME: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
 ; UNPACKED-LABEL: image_store_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
index c1c383eb583a..9c2b61aef8c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %data) {
 ; GFX6-LABEL: image_store_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
index f4b9dbfa257a..c9e1f03bc9c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s
 
 ; Natural mapping
 define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 %s) {
@@ -20,8 +20,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; FAST-NEXT:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; FAST-NEXT:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
-  ; FAST-NEXT:   [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; FAST-NEXT:   S_ENDPGM 0
   ;
   ; GREEDY-LABEL: name: load_1d_vgpr_vaddr__sgpr_srsrc
@@ -40,8 +39,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; GREEDY-NEXT:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY-NEXT:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
-  ; GREEDY-NEXT:   [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; GREEDY-NEXT:   S_ENDPGM 0
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   store <4 x float> %v, ptr addrspace(1) poison
@@ -67,8 +65,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; FAST-NEXT:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST-NEXT:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32)
   ; FAST-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
-  ; FAST-NEXT:   [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; FAST-NEXT:   S_ENDPGM 0
   ;
   ; GREEDY-LABEL: name: load_1d_sgpr_vaddr__sgpr_srsrc
@@ -88,8 +85,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; GREEDY-NEXT:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY-NEXT:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32)
   ; GREEDY-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
-  ; GREEDY-NEXT:   [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; GREEDY-NEXT:   S_ENDPGM 0
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   store <4 x float> %v, ptr addrspace(1) poison
@@ -120,17 +116,16 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; FAST-NEXT: bb.2:
   ; FAST-NEXT:   successors: %bb.3(0x80000000)
   ; FAST-NEXT: {{  $}}
-  ; FAST-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.3
-  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; FAST-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; FAST-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; FAST-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; FAST-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>)
   ; FAST-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -140,8 +135,8 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; FAST-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; FAST-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; FAST-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
-  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
+  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT8]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; FAST-NEXT: {{  $}}
   ; FAST-NEXT: bb.3:
   ; FAST-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -156,8 +151,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; FAST-NEXT:   $exec = S_MOV_B64_term [[S_MOV_B64_]]
   ; FAST-NEXT: {{  $}}
   ; FAST-NEXT: bb.5:
-  ; FAST-NEXT:   [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; FAST-NEXT:   S_ENDPGM 0
   ;
   ; GREEDY-LABEL: name: load_1d_vgpr_vaddr__vgpr_srsrc
@@ -182,17 +176,16 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; GREEDY-NEXT: bb.2:
   ; GREEDY-NEXT:   successors: %bb.3(0x80000000)
   ; GREEDY-NEXT: {{  $}}
-  ; GREEDY-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.3
-  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; GREEDY-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; GREEDY-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; GREEDY-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>)
   ; GREEDY-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -202,8 +195,8 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; GREEDY-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; GREEDY-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; GREEDY-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
-  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
+  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT8]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; GREEDY-NEXT: {{  $}}
   ; GREEDY-NEXT: bb.3:
   ; GREEDY-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -218,8 +211,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; GREEDY-NEXT:   $exec = S_MOV_B64_term [[S_MOV_B64_]]
   ; GREEDY-NEXT: {{  $}}
   ; GREEDY-NEXT: bb.5:
-  ; GREEDY-NEXT:   [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; GREEDY-NEXT:   S_ENDPGM 0
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   store <4 x float> %v, ptr addrspace(1) poison
@@ -251,17 +243,16 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; FAST-NEXT: bb.2:
   ; FAST-NEXT:   successors: %bb.3(0x80000000)
   ; FAST-NEXT: {{  $}}
-  ; FAST-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.3
-  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; FAST-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; FAST-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; FAST-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; FAST-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>)
   ; FAST-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -271,8 +262,8 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; FAST-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; FAST-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; FAST-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
-  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
+  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT8]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; FAST-NEXT: {{  $}}
   ; FAST-NEXT: bb.3:
   ; FAST-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -287,8 +278,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; FAST-NEXT:   $exec = S_MOV_B64_term [[S_MOV_B64_]]
   ; FAST-NEXT: {{  $}}
   ; FAST-NEXT: bb.5:
-  ; FAST-NEXT:   [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; FAST-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; FAST-NEXT:   S_ENDPGM 0
   ;
   ; GREEDY-LABEL: name: load_1d_sgpr_vaddr__vgpr_srsrc
@@ -314,17 +304,16 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; GREEDY-NEXT: bb.2:
   ; GREEDY-NEXT:   successors: %bb.3(0x80000000)
   ; GREEDY-NEXT: {{  $}}
-  ; GREEDY-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.3
-  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; GREEDY-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; GREEDY-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; GREEDY-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>)
   ; GREEDY-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -334,8 +323,8 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; GREEDY-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; GREEDY-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; GREEDY-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
-  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
+  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT8]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; GREEDY-NEXT: {{  $}}
   ; GREEDY-NEXT: bb.3:
   ; GREEDY-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -350,8 +339,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; GREEDY-NEXT:   $exec = S_MOV_B64_term [[S_MOV_B64_]]
   ; GREEDY-NEXT: {{  $}}
   ; GREEDY-NEXT: bb.5:
-  ; GREEDY-NEXT:   [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
-  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
+  ; GREEDY-NEXT:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) poison`, addrspace 1)
   ; GREEDY-NEXT:   S_ENDPGM 0
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   store <4 x float> %v, ptr addrspace(1) poison
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
index 90e34f995711..ddca8b99e4aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s
 
 ; Natural mapping
 define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
@@ -141,17 +141,16 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
   ; FAST-NEXT: bb.2:
   ; FAST-NEXT:   successors: %bb.3(0x80000000)
   ; FAST-NEXT: {{  $}}
-  ; FAST-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3
-  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; FAST-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; FAST-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; FAST-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; FAST-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>)
   ; FAST-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -161,8 +160,8 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
   ; FAST-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; FAST-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; FAST-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
-  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
+  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT8]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; FAST-NEXT: {{  $}}
   ; FAST-NEXT: bb.3:
   ; FAST-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -207,17 +206,16 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
   ; GREEDY-NEXT: bb.2:
   ; GREEDY-NEXT:   successors: %bb.3(0x80000000)
   ; GREEDY-NEXT: {{  $}}
-  ; GREEDY-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3
-  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; GREEDY-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; GREEDY-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; GREEDY-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>)
   ; GREEDY-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -227,8 +225,8 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
   ; GREEDY-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; GREEDY-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; GREEDY-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
-  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND2]](s1)
+  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT8]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; GREEDY-NEXT: {{  $}}
   ; GREEDY-NEXT: bb.3:
   ; GREEDY-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -279,20 +277,19 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre
   ; FAST-NEXT: bb.2:
   ; FAST-NEXT:   successors: %bb.3(0x80000000)
   ; FAST-NEXT: {{  $}}
-  ; FAST-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3
-  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; FAST-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; FAST-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
   ; FAST-NEXT:   [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
   ; FAST-NEXT:   [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<4 x s32>)
   ; FAST-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
   ; FAST-NEXT:   [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
   ; FAST-NEXT:   [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
-  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
-  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT4]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; FAST-NEXT: {{  $}}
   ; FAST-NEXT: bb.3:
   ; FAST-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -337,20 +334,19 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre
   ; GREEDY-NEXT: bb.2:
   ; GREEDY-NEXT:   successors: %bb.3(0x80000000)
   ; GREEDY-NEXT: {{  $}}
-  ; GREEDY-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3
-  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
+  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; GREEDY-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32)
   ; GREEDY-NEXT:   [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
   ; GREEDY-NEXT:   [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<4 x s32>)
   ; GREEDY-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
   ; GREEDY-NEXT:   [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
   ; GREEDY-NEXT:   [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
-  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
-  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
+  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT4]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; GREEDY-NEXT: {{  $}}
   ; GREEDY-NEXT: bb.3:
   ; GREEDY-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -401,17 +397,16 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
   ; FAST-NEXT: bb.2:
   ; FAST-NEXT:   successors: %bb.3(0x80000000)
   ; FAST-NEXT: {{  $}}
-  ; FAST-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3
-  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; FAST-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; FAST-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; FAST-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; FAST-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; FAST-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>)
   ; FAST-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -421,20 +416,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
   ; FAST-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; FAST-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; FAST-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; FAST-NEXT:   [[UV16:%[0-9]+]]:vgpr_32(s32), [[UV17:%[0-9]+]]:vgpr_32(s32), [[UV18:%[0-9]+]]:vgpr_32(s32), [[UV19:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV16]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV17]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV18]](s32), implicit $exec
-  ; FAST-NEXT:   [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV19]](s32), implicit $exec
-  ; FAST-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32)
+  ; FAST-NEXT:   [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32), [[UV18:%[0-9]+]]:vgpr(s32), [[UV19:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV16]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV17]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV18]](s32)
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV19]](s32)
+  ; FAST-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT8]](s32), [[INTRINSIC_CONVERGENT9]](s32), [[INTRINSIC_CONVERGENT10]](s32), [[INTRINSIC_CONVERGENT11]](s32)
   ; FAST-NEXT:   [[UV20:%[0-9]+]]:vgpr(s64), [[UV21:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
   ; FAST-NEXT:   [[UV22:%[0-9]+]]:sgpr(s64), [[UV23:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR3]](<4 x s32>)
   ; FAST-NEXT:   [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV22]](s64), [[UV20]]
   ; FAST-NEXT:   [[AND3:%[0-9]+]]:vcc(s1) = G_AND [[AND2]], [[ICMP4]]
   ; FAST-NEXT:   [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV23]](s64), [[UV21]]
   ; FAST-NEXT:   [[AND4:%[0-9]+]]:vcc(s1) = G_AND [[AND3]], [[ICMP5]]
-  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND4]](s1)
-  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; FAST-NEXT:   [[INTRINSIC_CONVERGENT12:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND4]](s1)
+  ; FAST-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT12]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; FAST-NEXT: {{  $}}
   ; FAST-NEXT: bb.3:
   ; FAST-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
@@ -479,17 +474,16 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
   ; GREEDY-NEXT: bb.2:
   ; GREEDY-NEXT:   successors: %bb.3(0x80000000)
   ; GREEDY-NEXT: {{  $}}
-  ; GREEDY-NEXT:   [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3
-  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32)
+  ; GREEDY-NEXT:   [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV4]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV5]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV6]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV7]](s32)
+  ; GREEDY-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32), [[INTRINSIC_CONVERGENT2]](s32), [[INTRINSIC_CONVERGENT3]](s32), [[INTRINSIC_CONVERGENT4]](s32), [[INTRINSIC_CONVERGENT5]](s32), [[INTRINSIC_CONVERGENT6]](s32), [[INTRINSIC_CONVERGENT7]](s32)
   ; GREEDY-NEXT:   [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>)
   ; GREEDY-NEXT:   [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>)
   ; GREEDY-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV12]](s64), [[UV8]]
@@ -499,20 +493,20 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
   ; GREEDY-NEXT:   [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]]
   ; GREEDY-NEXT:   [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]]
   ; GREEDY-NEXT:   [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]]
-  ; GREEDY-NEXT:   [[UV16:%[0-9]+]]:vgpr_32(s32), [[UV17:%[0-9]+]]:vgpr_32(s32), [[UV18:%[0-9]+]]:vgpr_32(s32), [[UV19:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV16]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV17]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV18]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV19]](s32), implicit $exec
-  ; GREEDY-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32)
+  ; GREEDY-NEXT:   [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32), [[UV18:%[0-9]+]]:vgpr(s32), [[UV19:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV16]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV17]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV18]](s32)
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV19]](s32)
+  ; GREEDY-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[INTRINSIC_CONVERGENT8]](s32), [[INTRINSIC_CONVERGENT9]](s32), [[INTRINSIC_CONVERGENT10]](s32), [[INTRINSIC_CONVERGENT11]](s32)
   ; GREEDY-NEXT:   [[UV20:%[0-9]+]]:vgpr(s64), [[UV21:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
   ; GREEDY-NEXT:   [[UV22:%[0-9]+]]:sgpr(s64), [[UV23:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR3]](<4 x s32>)
   ; GREEDY-NEXT:   [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV22]](s64), [[UV20]]
   ; GREEDY-NEXT:   [[AND3:%[0-9]+]]:vcc(s1) = G_AND [[AND2]], [[ICMP4]]
   ; GREEDY-NEXT:   [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV23]](s64), [[UV21]]
   ; GREEDY-NEXT:   [[AND4:%[0-9]+]]:vcc(s1) = G_AND [[AND3]], [[ICMP5]]
-  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND4]](s1)
-  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GREEDY-NEXT:   [[INTRINSIC_CONVERGENT12:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND4]](s1)
+  ; GREEDY-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT12]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
   ; GREEDY-NEXT: {{  $}}
   ; GREEDY-NEXT: bb.3:
   ; GREEDY-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
index 37d0e5411f58..66ea509ef2e7 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
@@ -5,12 +5,13 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
 ; RUN: not --crash llc < %s -mtriple=amdgcn -mcpu=gfx1100 2>&1 | FileCheck %s -check-prefix=GFX11-ERR
 
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s  -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
-; RUN: not llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 2>&1 | FileCheck %s -check-prefix=G_GFX11-ERR
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii | FileCheck %s  -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: not llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 2>&1 | FileCheck %s -check-prefix=G_GFX11-ERR
 
+; image_atomic_fmin and image_atomic_fmax was removed on gfx11+
 ; GFX11-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.atomic.f
 ; G_GFX11-ERR: LLVM ERROR: cannot select: {{.*}} = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.f
 
diff --git a/llvm/test/CodeGen/AMDGPU/invariant-image-load.ll b/llvm/test/CodeGen/AMDGPU/invariant-image-load.ll
index 693ebc8ad11c..fc92f8fddec5 100644
--- a/llvm/test/CodeGen/AMDGPU/invariant-image-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/invariant-image-load.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
 
 ; Marking the image loads as invariant should allow both loads to be hoisted
 ; above both stores.
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 1caa1442fd2f..9644c941cd06 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s
 
 ; Check for verifier error due to trying to save and restore SCC
 ; around a waterfall looop when it was never defined. We have to get
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.flt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.flt.ll
index 71acac792232..377207de4f44 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.flt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.flt.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_ps float @atomic_min_flt_1d(<8 x i32> inreg %rsrc, float %data, i32 %s) {
 ; GFX12-LABEL: atomic_min_flt_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index 44a4e8171ff3..e8f74533eebd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN %s
 
 define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
 ; GCN-LABEL: load_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
index a6c77ff7046a..c39152ae5283 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A,SDAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -check-prefixes=GFX90A,SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A,GISEL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A,GISEL %s
 
 ; GFX90A-LABEL: {{^}}sample_1d:
 ; GFX90A-NOT: s_wqm_b64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
index f0ce166ca354..926f0cdb82c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel < %s | FileCheck -check-prefixes=GFX10GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL %s
 ; TODO: global-isel produces more code - there will need to be some more combines in the postregbankselectcombine phase
 ; Depends on some other changes to pass this test - those are in review separately
 
@@ -13,6 +13,8 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10GISEL-LABEL: sample_d_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10GISEL-NEXT:    image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -58,10 +60,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10GISEL-LABEL: sample_d_3d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, v8
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, v9, 16, v6
+; GFX10GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v7, s0, 16, v8
 ; GFX10GISEL-NEXT:    image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -79,6 +81,8 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10GISEL-LABEL: sample_c_d_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
 ; GFX10GISEL-NEXT:    image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -142,10 +146,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10GISEL-LABEL: sample_d_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v7, 16, v4
+; GFX10GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, s0, 16, v6
 ; GFX10GISEL-NEXT:    image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -190,10 +194,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10GISEL-LABEL: sample_c_d_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v6, v7
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, v8, 16, v5
+; GFX10GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, s0, 16, v7
 ; GFX10GISEL-NEXT:    image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -211,6 +215,8 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10GISEL-LABEL: sample_cd_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10GISEL-NEXT:    image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -248,6 +254,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10GISEL-LABEL: sample_c_cd_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
 ; GFX10GISEL-NEXT:    image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -311,10 +319,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ;
 ; GFX10GISEL-LABEL: sample_cd_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v7, 16, v4
+; GFX10GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, s0, 16, v6
 ; GFX10GISEL-NEXT:    image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -359,10 +367,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10GISEL-LABEL: sample_c_cd_cl_2d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v6, v7
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, v8, 16, v5
+; GFX10GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, s0, 16, v7
 ; GFX10GISEL-NEXT:    image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -388,10 +396,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, v8
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, v9, 16, v6
+; GFX10GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v7, s0, 16, v8
 ; GFX10GISEL-NEXT:    image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -417,10 +425,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, v8
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, v9, 16, v6
+; GFX10GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v7, s0, 16, v8
 ; GFX10GISEL-NEXT:    image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -460,6 +468,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_1d(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -504,12 +516,14 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_3d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX10GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX10GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX10GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v4, 16, v9
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v3, s0, 16, v9
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v4, 16, v10
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, s0, 16, v5
 ; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -527,6 +541,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_1d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10GISEL-NEXT:    image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -567,6 +585,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_1d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10GISEL-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -607,6 +629,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_1d(<8 x i32> inreg %rsrc,
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10GISEL-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -651,6 +677,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_1d(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10GISEL-NEXT:    image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -691,6 +721,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_1d(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10GISEL-NEXT:    image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -731,6 +765,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_1d(<8 x i32> inreg %rsrc, <
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
 ; GFX10GISEL-NEXT:    image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -771,6 +809,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_1d(<8 x i32> inreg %rsrc,
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10GISEL-NEXT:    image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -899,6 +941,12 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
 ;
 ; GFX10GISEL-LABEL: sample_d_1d_g16_a16:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
@@ -948,17 +996,19 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
 ;
 ; GFX10GISEL-LABEL: sample_d_3d_g16_a16:
 ; GFX10GISEL:       ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v10, v7
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX10GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX10GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; GFX10GISEL-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, v1, 16, v0
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v6, v10, 16, v6
-; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX10GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX10GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX10GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
+; GFX10GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v8
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v4, v7, 16, v5
+; GFX10GISEL-NEXT:    v_lshl_or_b32 v5, s0, 16, v6
+; GFX10GISEL-NEXT:    image_sample_d_g16 v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX10GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT:    ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
index 4873b42a235e..d8358460962a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
@@ -1,15 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
-; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define amdgpu_ps void @sample_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
 ; GFX10PLUS-LABEL: sample_1d_nortn:
@@ -436,15 +435,52 @@ main_body:
 }
 
 define amdgpu_ps void @sample_d_1d_g16_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
-; GFX10PLUS-LABEL: sample_d_1d_g16_nortn:
-; GFX10PLUS:       ; %bb.0: ; %main_body
-; GFX10PLUS-NEXT:    image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX10PLUS-NEXT:    s_endpgm
+; GFX10PLUS-SDAG-LABEL: sample_d_1d_g16_nortn:
+; GFX10PLUS-SDAG:       ; %bb.0: ; %main_body
+; GFX10PLUS-SDAG-NEXT:    image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: sample_d_1d_g16_nortn:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_endpgm
+; GFX10-GISEL-LABEL: sample_d_1d_g16_nortn:
+; GFX10-GISEL:       ; %bb.0: ; %main_body
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX10-GISEL-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX10-GISEL-NEXT:    image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: sample_d_1d_g16_nortn:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-GISEL-TRUE16-NEXT:    image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: sample_d_1d_g16_nortn:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: sample_d_1d_g16_nortn:
+; GFX12-SDAG:       ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT:    image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: sample_d_1d_g16_nortn:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-GISEL-TRUE16-NEXT:    image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-FAKE16-LABEL: sample_d_1d_g16_nortn:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret void
@@ -475,16 +511,3 @@ declare void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32, half, half, float
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX10-GISEL: {{.*}}
-; GFX10-SDAG: {{.*}}
-; GFX11: {{.*}}
-; GFX11-GISEL: {{.*}}
-; GFX11-GISEL-FAKE16: {{.*}}
-; GFX11-SDAG: {{.*}}
-; GFX11-SDAG-FAKE16: {{.*}}
-; GFX11-SDAG-TRUE16: {{.*}}
-; GFX12-GISEL-FAKE16: {{.*}}
-; GFX12-SDAG-FAKE16: {{.*}}
-; GFX12-SDAG-TRUE16: {{.*}}