diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f2650f678dea..f7d4b51c7516 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2684,16 +2684,6 @@ def int_amdgcn_call_whole_wave: llvm_vararg_ty], // The arguments to the callee. [IntrConvergent]>; -// llvm.amdgcn.wave.shuffle -// value and result can be a 32bit floating-point or -// integer type, and must be the same type. Any index -// value that's outside the valid range will wrap around, -// and reading from an inactive lane will return poison. -def int_amdgcn_wave_shuffle : - DefaultAttrsIntrinsic<[llvm_any_ty], // return type - [LLVMMatchType<0>, llvm_i32_ty], // arg types - [IntrConvergent, IntrNoMem]>; // flags - //===----------------------------------------------------------------------===// // CI+ Intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3093b9aaf174..5dc7c8327102 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1216,8 +1216,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { case Intrinsic::amdgcn_permlane16_swap: case Intrinsic::amdgcn_permlane32_swap: return selectPermlaneSwapIntrin(I, IntrinsicID); - case Intrinsic::amdgcn_wave_shuffle: - return selectWaveShuffleIntrin(I); default: return selectImpl(I, *CoverageInfo); } @@ -3896,130 +3894,6 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { return true; } -bool AMDGPUInstructionSelector::selectWaveShuffleIntrin( - MachineInstr &MI) const { - assert(MI.getNumOperands() == 4); - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - Register DstReg = MI.getOperand(0).getReg(); - Register ValReg = MI.getOperand(2).getReg(); - Register IdxReg = MI.getOperand(3).getReg(); - - const LLT DstTy = MRI->getType(DstReg); - unsigned DstSize = DstTy.getSizeInBits(); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); - const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstRB); - - if (DstTy != LLT::scalar(32)) - return false; - - // If we can bpermute across the whole wave, then just do that - if (Subtarget->supportsWaveWideBPermute()) { - Register ShiftIdxReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg) - .addImm(2) - .addReg(IdxReg); - - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg) - .addReg(ShiftIdxReg) - .addReg(ValReg) - .addImm(0); - } else { - // Otherwise, we need to make use of whole wave mode - assert(Subtarget->isWave64()); - - // Set inactive lanes to poison - Register UndefValReg = - MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID)); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg); - - Register UndefExecReg = MRI->createVirtualRegister( - TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID)); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg); - - Register PoisonValReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg) - .addImm(0) - .addReg(ValReg) - .addImm(0) - .addReg(UndefValReg) - .addReg(UndefExecReg); - - // ds_bpermute requires index to be multiplied by 4 - Register ShiftIdxReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg) - .addImm(2) - .addReg(IdxReg); - - Register PoisonIdxReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg) - .addImm(0) - .addReg(ShiftIdxReg) - .addImm(0) - .addReg(UndefValReg) - .addReg(UndefExecReg); - - // Get permutation of each half, then we'll select which one to use - Register SameSidePermReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg) - .addReg(PoisonIdxReg) - .addReg(PoisonValReg) - .addImm(0); - - Register SwappedValReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg) - .addReg(PoisonValReg); - - Register OppSidePermReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg) - .addReg(PoisonIdxReg) - .addReg(SwappedValReg) - .addImm(0); - - Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg) - .addReg(OppSidePermReg); - - // Select which side to take the permute from - // We can get away with only using mbcnt_lo here since we're only - // trying to detect which side of 32 each lane is on, and mbcnt_lo - // returns 32 for lanes 32-63. - Register ThreadIDReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg) - .addImm(-1) - .addImm(0); - - Register XORReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg) - .addReg(ThreadIDReg) - .addReg(PoisonIdxReg); - - Register ANDReg = MRI->createVirtualRegister(DstRC); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg) - .addReg(XORReg) - .addImm(32); - - Register CompareReg = MRI->createVirtualRegister( - TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID)); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg) - .addReg(ANDReg) - .addImm(0); - - // Finally do the selection - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) - .addImm(0) - .addReg(WWMSwapPermReg) - .addImm(0) - .addReg(SameSidePermReg) - .addReg(CompareReg); - } - - MI.eraseFromParent(); - return true; -} - // Match BITOP3 operation and return a number of matched instructions plus // truth table. static std::pair BitOp3_Op(Register R, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 627cce277ae3..c760fe7ef99d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -156,7 +156,6 @@ private: bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierLeave(MachineInstr &I) const; - bool selectWaveShuffleIntrin(MachineInstr &I) const; std::pair selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7470fecd3c03..266c708f4873 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5239,20 +5239,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize); break; } - case Intrinsic::amdgcn_s_bitreplicate: { + case Intrinsic::amdgcn_s_bitreplicate: Register MaskReg = MI.getOperand(2).getReg(); unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32); - break; - } - case Intrinsic::amdgcn_wave_shuffle: { - unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); - OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); - OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); - break; - } } break; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 2d1e54bf5883..1cd434a9948a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1883,12 +1883,6 @@ public: bool requiresWaitsBeforeSystemScopeStores() const { return RequiresWaitsBeforeSystemScopeStores; } - - bool supportsWaveWideBPermute() const { - return (getGeneration() <= AMDGPUSubtarget::GFX9 || - getGeneration() == AMDGPUSubtarget::GFX12) || - isWave32(); - } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e63835fd8d14..3dd8dd2b90b4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7368,84 +7368,6 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, return DAG.getBitcast(VT, UnrolledLaneOp); } -static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, - SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - if (VT.getSizeInBits() != 32) - return SDValue(); - - SDLoc SL(N); - - SDValue Value = N->getOperand(1); - SDValue Index = N->getOperand(2); - - // ds_bpermute requires index to be multiplied by 4 - SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL); - SDValue ShiftedIndex = - DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount); - - // Intrinsics will require i32 to operate on - SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value); - - auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT, - SmallVector IntrinArgs) -> SDValue { - SmallVector Operands(1); - Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32); - Operands.append(IntrinArgs); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands); - }; - - // If we can bpermute across the whole wave, then just do that - if (TLI.getSubtarget()->supportsWaveWideBPermute()) { - SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32, - {ShiftedIndex, ValueI32}); - return DAG.getBitcast(VT, BPermute); - } - - assert(TLI.getSubtarget()->isWave64()); - - // Otherwise, we need to make use of whole wave mode - SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0)); - - // Set inactive lanes to poison - SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, - {ValueI32, PoisonVal}); - SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, - {ShiftedIndex, PoisonVal}); - - SDValue Swapped = - MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue}); - - // Get permutation of each half, then we'll select which one to use - SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32, - {WWMIndex, WWMValue}); - SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, - MVT::i32, {WWMIndex, Swapped}); - SDValue BPermOtherHalfWWM = - MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf}); - - // Select which side to take the permute from - SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32); - // We can get away with only using mbcnt_lo here since we're only - // trying to detect which side of 32 each lane is on, and mbcnt_lo - // returns 32 for lanes 32-63. - SDValue ThreadID = - MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32, - {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)}); - - SDValue SameOrOtherHalf = - DAG.getNode(ISD::AND, SL, MVT::i32, - DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index), - DAG.getTargetConstant(32, SL, MVT::i32)); - SDValue UseSameHalf = - DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf, - DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ); - SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf, - BPermOtherHalfWWM); - return DAG.getBitcast(VT, Result); -} - void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { @@ -10364,8 +10286,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Poisons.push_back(DAG.getPOISON(ValTy)); return DAG.getMergeValues(Poisons, SDLoc(Op)); } - case Intrinsic::amdgcn_wave_shuffle: - return lowerWaveShuffle(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll deleted file mode 100644 index 664a203a8f38..000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.shuffle.ll +++ /dev/null @@ -1,226 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8-W32 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-W32 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s - -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX8-W64 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX9-W64 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s - -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8-W32-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-W32-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32-GISEL %s - -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX8-W64-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX9-W64-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64-GISEL %s - -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX6-SDAG-ERR %s -; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX6-GISEL-ERR %s - -; GFX6-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.bpermute -; GFX6-GISEL-ERR: "Invalid opcode!" - -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX7-SDAG-ERR %s -; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX7-GISEL-ERR %s - -; GFX7-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.bpermute -; GFX7-GISEL-ERR: "Invalid opcode!" - - -define float @test_wave_shuffle_float(float %val, i32 %idx) { -; GFX8-W32-LABEL: test_wave_shuffle_float: -; GFX8-W32: ; %bb.0: ; %entry -; GFX8-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX8-W32-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX8-W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-W32-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-W32-LABEL: test_wave_shuffle_float: -; GFX9-W32: ; %bb.0: ; %entry -; GFX9-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX9-W32-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX9-W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-W32-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-W32-LABEL: test_wave_shuffle_float: -; GFX11-W32: ; %bb.0: ; %entry -; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-W32-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-W32-LABEL: test_wave_shuffle_float: -; GFX12-W32: ; %bb.0: ; %entry -; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-W32-NEXT: s_wait_expcnt 0x0 -; GFX12-W32-NEXT: s_wait_samplecnt 0x0 -; GFX12-W32-NEXT: s_wait_bvhcnt 0x0 -; GFX12-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX12-W32-NEXT: s_wait_dscnt 0x0 -; GFX12-W32-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-W64-LABEL: test_wave_shuffle_float: -; GFX8-W64: ; %bb.0: ; %entry -; GFX8-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX8-W64-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX8-W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-W64-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-W64-LABEL: test_wave_shuffle_float: -; GFX9-W64: ; %bb.0: ; %entry -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX9-W64-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-W64-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-W64-LABEL: test_wave_shuffle_float: -; GFX11-W64: ; %bb.0: ; %entry -; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill -; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1 -; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec -; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX11-W64-NEXT: v_permlane64_b32 v2, v0 -; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2 -; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0 -; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 -; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1 -; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1 -; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload -; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1] -; GFX11-W64-NEXT: s_waitcnt vmcnt(0) -; GFX11-W64-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-W64-LABEL: test_wave_shuffle_float: -; GFX12-W64: ; %bb.0: ; %entry -; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-W64-NEXT: s_wait_expcnt 0x0 -; GFX12-W64-NEXT: s_wait_samplecnt 0x0 -; GFX12-W64-NEXT: s_wait_bvhcnt 0x0 -; GFX12-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX12-W64-NEXT: s_wait_dscnt 0x0 -; GFX12-W64-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-W32-GISEL-LABEL: test_wave_shuffle_float: -; GFX8-W32-GISEL: ; %bb.0: ; %entry -; GFX8-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX8-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX8-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-W32-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-W32-GISEL-LABEL: test_wave_shuffle_float: -; GFX9-W32-GISEL: ; %bb.0: ; %entry -; GFX9-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX9-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX9-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-W32-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-W32-GISEL-LABEL: test_wave_shuffle_float: -; GFX11-W32-GISEL: ; %bb.0: ; %entry -; GFX11-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX11-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-W32-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-W32-GISEL-LABEL: test_wave_shuffle_float: -; GFX12-W32-GISEL: ; %bb.0: ; %entry -; GFX12-W32-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-W32-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-W32-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-W32-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-W32-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX12-W32-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-W32-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-W64-GISEL-LABEL: test_wave_shuffle_float: -; GFX8-W64-GISEL: ; %bb.0: ; %entry -; GFX8-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX8-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX8-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-W64-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-W64-GISEL-LABEL: test_wave_shuffle_float: -; GFX9-W64-GISEL: ; %bb.0: ; %entry -; GFX9-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX9-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX9-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-W64-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-W64-GISEL-LABEL: test_wave_shuffle_float: -; GFX11-W64-GISEL: ; %bb.0: ; %entry -; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX11-W64-GISEL-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill -; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1] -; GFX11-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec -; GFX11-W64-GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX11-W64-GISEL-NEXT: v_permlane64_b32 v2, v0 -; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v2, v1, v2 -; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1] -; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX11-W64-GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 -; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-W64-GISEL-NEXT: v_xor_b32_e32 v1, v3, v1 -; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-W64-GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-W64-GISEL-NEXT: v_and_b32_e32 v1, 32, v1 -; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-W64-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-W64-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX11-W64-GISEL-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload -; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1] -; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-W64-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-W64-GISEL-LABEL: test_wave_shuffle_float: -; GFX12-W64-GISEL: ; %bb.0: ; %entry -; GFX12-W64-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-W64-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-W64-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-W64-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-W64-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX12-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0 -; GFX12-W64-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-W64-GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %0 = tail call float @llvm.amdgcn.wave.shuffle(float %val, i32 %idx) - ret float %0 -}