Reland "[AMDGPU] Add new llvm.amdgcn.wave.shuffle intrinsic (#167372)" (#174614)

This change adds a new intrinsic for AMDGPU that implements a wave shuffle, allowing arbitrary swizzling between lanes using an index. In the initial version of this commit, there was an issue in one of the tests added that returned a signal, causing testing to fail when combined with another recent change to 'not'. For context on the initial commit see #167372 --------- Signed-off-by: Domenic Nutile <domenic.nutile@gmail.com> Co-authored-by: Jay Foad <jay.foad@gmail.com>
2026-01-06 15:02:08 -05:00 · 2026-01-06 15:02:08 -05:00 · c262893f4b
commit c262893f4b
parent 1f5126dae1
7 changed files with 459 additions and 1 deletions
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@ -2684,6 +2684,16 @@ def int_amdgcn_call_whole_wave:
             llvm_vararg_ty], // The arguments to the callee.
            [IntrConvergent]>;

+// <result> llvm.amdgcn.wave.shuffle <value> <id>
+//
+// Wave shuffle: arbitrary swizzling between lanes, where <id> (an i32) is the
+// index of the lane to read <value> from.
+// <value> and <result> can be a 32-bit floating-point or
+// integer type, and must be the same type. Any index
+// value that's outside the valid range will wrap around,
+// and reading from an inactive lane will return poison.
+def int_amdgcn_wave_shuffle :
+  DefaultAttrsIntrinsic<[llvm_any_ty],                    // return type
+                        [LLVMMatchType<0>, llvm_i32_ty],  // arg types
+                        [IntrConvergent, IntrNoMem]>;     // flags
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@ -1216,6 +1216,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
+  case Intrinsic::amdgcn_wave_shuffle:
+    return selectWaveShuffleIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
@ -3894,6 +3896,130 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
  return true;
 }

+/// Select llvm.amdgcn.wave.shuffle.
+///
+/// \p MI is expected to have four operands: the result (0), the intrinsic ID
+/// (1), the value to shuffle (2) and the lane index to read from (3).
+///
+/// Only 32-bit scalar results are handled. When the subtarget can bpermute
+/// across the whole wave this emits a shift plus a single DS_BPERMUTE_B32.
+/// Otherwise (wave64 only) it permutes both the value and its
+/// V_PERMLANE64-swapped copy, and picks per lane whichever result came from
+/// the half that the requested index lives in.
+///
+/// \returns false if the result type is not s32, true after \p MI has been
+/// replaced and erased.
+bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
+    MachineInstr &MI) const {
+  assert(MI.getNumOperands() == 4);
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register ValReg = MI.getOperand(2).getReg();
+  Register IdxReg = MI.getOperand(3).getReg();
+
+  // Reject unsupported types before doing any register class lookups.
+  const LLT DstTy = MRI->getType(DstReg);
+  if (DstTy != LLT::scalar(32))
+    return false;
+
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  const TargetRegisterClass *DstRC =
+      TRI.getRegClassForSizeOnBank(DstTy.getSizeInBits(), *DstRB);
+
+  // If we can bpermute across the whole wave, then just do that
+  if (Subtarget->supportsWaveWideBPermute()) {
+    // ds_bpermute requires index to be multiplied by 4
+    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+        .addImm(2)
+        .addReg(IdxReg);
+
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+        .addReg(ShiftIdxReg)
+        .addReg(ValReg)
+        .addImm(0);
+  } else {
+    // Otherwise, we need to make use of whole wave mode
+    assert(Subtarget->isWave64());
+
+    // Set inactive lanes to poison
+    Register UndefValReg =
+        MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
+
+    Register UndefExecReg = MRI->createVirtualRegister(
+        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
+
+    Register PoisonValReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
+        .addImm(0)
+        .addReg(ValReg)
+        .addImm(0)
+        .addReg(UndefValReg)
+        .addReg(UndefExecReg);
+
+    // ds_bpermute requires index to be multiplied by 4
+    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
+        .addImm(2)
+        .addReg(IdxReg);
+
+    Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
+        .addImm(0)
+        .addReg(ShiftIdxReg)
+        .addImm(0)
+        .addReg(UndefValReg)
+        .addReg(UndefExecReg);
+
+    // Get permutation of each half, then we'll select which one to use
+    Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
+        .addReg(PoisonIdxReg)
+        .addReg(PoisonValReg)
+        .addImm(0);
+
+    Register SwappedValReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
+        .addReg(PoisonValReg);
+
+    Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
+        .addReg(PoisonIdxReg)
+        .addReg(SwappedValReg)
+        .addImm(0);
+
+    Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
+        .addReg(OppSidePermReg);
+
+    // Select which side to take the permute from
+    // We can get away with only using mbcnt_lo here since we're only
+    // trying to detect which side of 32 each lane is on, and mbcnt_lo
+    // returns 32 for lanes 32-63.
+    Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
+        .addImm(-1)
+        .addImm(0);
+
+    // Compare bit 5 of the requested lane index against bit 5 of this lane's
+    // ID. This has to use the original, unshifted index: PoisonIdxReg holds
+    // the index multiplied by 4, so its bit 5 is bit 3 of the lane index and
+    // would pick the wrong half. This matches the SelectionDAG lowering.
+    Register XORReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
+        .addReg(ThreadIDReg)
+        .addReg(IdxReg);
+
+    Register ANDReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
+        .addReg(XORReg)
+        .addImm(32);
+
+    // The compare is true when source and destination lanes are in the same
+    // half of the wave.
+    Register CompareReg = MRI->createVirtualRegister(
+        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
+        .addReg(ANDReg)
+        .addImm(0);
+
+    // Finally do the selection
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .addImm(0)
+        .addReg(WWMSwapPermReg)
+        .addImm(0)
+        .addReg(SameSidePermReg)
+        .addReg(CompareReg);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
 // Match BITOP3 operation and return a number of matched instructions plus
 // truth table.
 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@ -156,6 +156,7 @@ private:
  bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
  bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
  bool selectSBarrierLeave(MachineInstr &I) const;
+  bool selectWaveShuffleIntrin(MachineInstr &I) const;

  std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
                                                   bool IsCanonicalizing = true,
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@ -5239,11 +5239,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
      OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
      break;
    }
-    case Intrinsic::amdgcn_s_bitreplicate:
+    case Intrinsic::amdgcn_s_bitreplicate: {
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
+      break;
+    }
+    case Intrinsic::amdgcn_wave_shuffle: {
+      unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+      break;
+    }
    }
    break;
  }
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@ -1883,6 +1883,12 @@ public:
  bool requiresWaitsBeforeSystemScopeStores() const {
    return RequiresWaitsBeforeSystemScopeStores;
  }
+
+  /// \returns true if a bpermute can be used across the whole wave on this
+  /// subtarget: always in wave32, and in wave64 only for generations up to
+  /// and including GFX9 or for GFX12.
+  bool supportsWaveWideBPermute() const {
+    if (isWave32())
+      return true;
+
+    const auto Gen = getGeneration();
+    return Gen <= AMDGPUSubtarget::GFX9 || Gen == AMDGPUSubtarget::GFX12;
+  }
 };

 class GCNUserSGPRUsageInfo {
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@ -7368,6 +7368,84 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
  return DAG.getBitcast(VT, UnrolledLaneOp);
 }

+/// Lower llvm.amdgcn.wave.shuffle for SelectionDAG.
+///
+/// Operand 1 of \p N is the value to shuffle and operand 2 is the lane index
+/// to read it from. Only 32-bit value types are lowered; for any other size
+/// an empty SDValue is returned.
+///
+/// When the subtarget can bpermute across the whole wave, the result is a
+/// single ds_bpermute on the index multiplied by 4. Otherwise (wave64 only)
+/// the value and its permlane64-swapped copy are both permuted, and each lane
+/// selects the result that came from the half its requested index lives in.
+static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
+                                SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  if (VT.getSizeInBits() != 32)
+    return SDValue();
+
+  SDLoc SL(N);
+
+  SDValue Value = N->getOperand(1);
+  SDValue Index = N->getOperand(2);
+
+  // ds_bpermute requires index to be multiplied by 4
+  SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
+  SDValue ShiftedIndex =
+      DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
+
+  // Intrinsics will require i32 to operate on
+  SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
+
+  // Build an INTRINSIC_WO_CHAIN node for IID with the given arguments. The
+  // arguments are taken as an ArrayRef so that braced call sites do not
+  // allocate and copy a temporary SmallVector.
+  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
+                                   ArrayRef<SDValue> IntrinArgs) -> SDValue {
+    SmallVector<SDValue> Operands;
+    Operands.reserve(IntrinArgs.size() + 1);
+    Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
+    Operands.append(IntrinArgs.begin(), IntrinArgs.end());
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
+  };
+
+  // If we can bpermute across the whole wave, then just do that
+  if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
+    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+                                     {ShiftedIndex, ValueI32});
+    return DAG.getBitcast(VT, BPermute);
+  }
+
+  assert(TLI.getSubtarget()->isWave64());
+
+  // Otherwise, we need to make use of whole wave mode
+  SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
+
+  // Set inactive lanes to poison
+  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+                                   {ValueI32, PoisonVal});
+  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
+                                   {ShiftedIndex, PoisonVal});
+
+  SDValue Swapped =
+      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
+
+  // Get permutation of each half, then we'll select which one to use
+  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+                                        {WWMIndex, WWMValue});
+  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
+                                         MVT::i32, {WWMIndex, Swapped});
+  SDValue BPermOtherHalfWWM =
+      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
+
+  // Select which side to take the permute from
+  SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
+  // We can get away with only using mbcnt_lo here since we're only
+  // trying to detect which side of 32 each lane is on, and mbcnt_lo
+  // returns 32 for lanes 32-63.
+  SDValue ThreadID =
+      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
+                    {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
+
+  // Bit 5 of (ThreadID ^ Index) is clear exactly when the requested lane is in
+  // the same half of the wave as the current lane. Note this uses the
+  // original index, not the one pre-multiplied for ds_bpermute.
+  SDValue SameOrOtherHalf =
+      DAG.getNode(ISD::AND, SL, MVT::i32,
+                  DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
+                  DAG.getTargetConstant(32, SL, MVT::i32));
+  SDValue UseSameHalf =
+      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
+                   DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
+  SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
+                                 BPermOtherHalfWWM);
+  return DAG.getBitcast(VT, Result);
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
@ -10286,6 +10364,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
      Poisons.push_back(DAG.getPOISON(ValTy));
    return DAG.getMergeValues(Poisons, SDLoc(Op));
  }
+  case Intrinsic::amdgcn_wave_shuffle:
+    return lowerWaveShuffle(*this, Op.getNode(), DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8-W32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-W32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX8-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX9-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8-W32-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-W32-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32-GISEL %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX8-W64-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX9-W64-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64-GISEL %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX6-SDAG-ERR %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX6-GISEL-ERR %s
+
+; GFX6-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.bpermute
+; GFX6-GISEL-ERR: "Invalid opcode!"
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX7-SDAG-ERR %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX7-GISEL-ERR %s
+
+; GFX7-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.bpermute
+; GFX7-GISEL-ERR: "Invalid opcode!"
+
+
+; Shuffle a float across the wave with a variable lane index. In the
+; expectations below, only the GFX11 wave64 runs expand to the whole-wave-mode
+; sequence (v_permlane64 plus two ds_bpermute and a v_cndmask select); every
+; other configuration emits a single shift followed by one ds_bpermute.
+define float @test_wave_shuffle_float(float %val, i32 %idx) {
+; GFX8-W32-LABEL: test_wave_shuffle_float:
+; GFX8-W32:       ; %bb.0: ; %entry
+; GFX8-W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX8-W32-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX8-W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-W32-LABEL: test_wave_shuffle_float:
+; GFX9-W32:       ; %bb.0: ; %entry
+; GFX9-W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX9-W32-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX9-W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-W32-LABEL: test_wave_shuffle_float:
+; GFX11-W32:       ; %bb.0: ; %entry
+; GFX11-W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W32-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX11-W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-W32-LABEL: test_wave_shuffle_float:
+; GFX12-W32:       ; %bb.0: ; %entry
+; GFX12-W32-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-W32-NEXT:    s_wait_expcnt 0x0
+; GFX12-W32-NEXT:    s_wait_samplecnt 0x0
+; GFX12-W32-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-W32-NEXT:    s_wait_kmcnt 0x0
+; GFX12-W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W32-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX12-W32-NEXT:    s_wait_dscnt 0x0
+; GFX12-W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-W64-LABEL: test_wave_shuffle_float:
+; GFX8-W64:       ; %bb.0: ; %entry
+; GFX8-W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX8-W64-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX8-W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-W64-LABEL: test_wave_shuffle_float:
+; GFX9-W64:       ; %bb.0: ; %entry
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX9-W64-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-W64-LABEL: test_wave_shuffle_float:
+; GFX11-W64:       ; %bb.0: ; %entry
+; GFX11-W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT:    scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GFX11-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-W64-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $exec
+; GFX11-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT:    v_permlane64_b32 v2, v0
+; GFX11-W64-NEXT:    ds_bpermute_b32 v2, v3, v2
+; GFX11-W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT:    ds_bpermute_b32 v0, v3, v0
+; GFX11-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX11-W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-W64-NEXT:    v_xor_b32_e32 v1, v3, v1
+; GFX11-W64-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-W64-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-W64-NEXT:    v_and_b32_e32 v1, 32, v1
+; GFX11-W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-W64-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-W64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-W64-LABEL: test_wave_shuffle_float:
+; GFX12-W64:       ; %bb.0: ; %entry
+; GFX12-W64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-W64-NEXT:    s_wait_expcnt 0x0
+; GFX12-W64-NEXT:    s_wait_samplecnt 0x0
+; GFX12-W64-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-W64-NEXT:    s_wait_kmcnt 0x0
+; GFX12-W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W64-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX12-W64-NEXT:    s_wait_dscnt 0x0
+; GFX12-W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX8-W32-GISEL:       ; %bb.0: ; %entry
+; GFX8-W32-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-W32-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX8-W32-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX8-W32-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-W32-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX9-W32-GISEL:       ; %bb.0: ; %entry
+; GFX9-W32-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-W32-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX9-W32-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX9-W32-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W32-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX11-W32-GISEL:       ; %bb.0: ; %entry
+; GFX11-W32-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W32-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W32-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX11-W32-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-W32-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-W32-GISEL-LABEL: test_wave_shuffle_float:
+; GFX12-W32-GISEL:       ; %bb.0: ; %entry
+; GFX12-W32-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-W32-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-W32-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-W32-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-W32-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-W32-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W32-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX12-W32-GISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12-W32-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX8-W64-GISEL:       ; %bb.0: ; %entry
+; GFX8-W64-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-W64-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX8-W64-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX8-W64-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-W64-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX9-W64-GISEL:       ; %bb.0: ; %entry
+; GFX9-W64-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-W64-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX9-W64-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX9-W64-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W64-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX11-W64-GISEL:       ; %bb.0: ; %entry
+; GFX11-W64-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-W64-GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT:    scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX11-W64-GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-W64-GISEL-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX11-W64-GISEL-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec
+; GFX11-W64-GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT:    v_permlane64_b32 v2, v0
+; GFX11-W64-GISEL-NEXT:    ds_bpermute_b32 v2, v1, v2
+; GFX11-W64-GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX11-W64-GISEL-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
+; GFX11-W64-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-W64-GISEL-NEXT:    v_xor_b32_e32 v1, v3, v1
+; GFX11-W64-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-W64-GISEL-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-W64-GISEL-NEXT:    v_and_b32_e32 v1, 32, v1
+; GFX11-W64-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-W64-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX11-W64-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-W64-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX11-W64-GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX11-W64-GISEL-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX11-W64-GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11-W64-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-W64-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-W64-GISEL-LABEL: test_wave_shuffle_float:
+; GFX12-W64-GISEL:       ; %bb.0: ; %entry
+; GFX12-W64-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-W64-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-W64-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-W64-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-W64-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-W64-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-W64-GISEL-NEXT:    ds_bpermute_b32 v0, v1, v0
+; GFX12-W64-GISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12-W64-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call float @llvm.amdgcn.wave.shuffle(float %val, i32 %idx)
+  ret float %0
+}