Revert "[AMDGPU] Add new llvm.amdgcn.wave.shuffle intrinsic" (#174501)
Reverts llvm/llvm-project#167372
This commit is contained in:
parent
2f06623953
commit
4bca00d56b
@ -2684,16 +2684,6 @@ def int_amdgcn_call_whole_wave:
|
||||
llvm_vararg_ty], // The arguments to the callee.
|
||||
[IntrConvergent]>;
|
||||
|
||||
// <result> llvm.amdgcn.wave.shuffle <value> <id>
|
||||
// <value> and <result> can be a 32-bit floating-point or
|
||||
// integer type, and must be the same type. <id> is an i32 lane index. Any index
|
||||
// value that's outside the valid range will wrap around,
|
||||
// and reading from an inactive lane will return poison.
|
||||
def int_amdgcn_wave_shuffle :
|
||||
DefaultAttrsIntrinsic<[llvm_any_ty], // return type (overloaded)
|
||||
[LLVMMatchType<0>, llvm_i32_ty], // arg types: value (same type as result), lane index
|
||||
[IntrConvergent, IntrNoMem]>; // flags: convergent, does not access memory
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// CI+ Intrinsics
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
@ -1216,8 +1216,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
|
||||
case Intrinsic::amdgcn_permlane16_swap:
|
||||
case Intrinsic::amdgcn_permlane32_swap:
|
||||
return selectPermlaneSwapIntrin(I, IntrinsicID);
|
||||
case Intrinsic::amdgcn_wave_shuffle:
|
||||
return selectWaveShuffleIntrin(I);
|
||||
default:
|
||||
return selectImpl(I, *CoverageInfo);
|
||||
}
|
||||
@ -3896,130 +3894,6 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Select the llvm.amdgcn.wave.shuffle intrinsic.
///
/// Reads operand 0 as the result, operand 2 as the value to shuffle and
/// operand 3 as the lane index (operand 1 is not read here; presumably it is
/// the intrinsic ID). Only 32-bit scalar results are handled.
///
/// When the subtarget reports that DS_BPERMUTE_B32 can reach the whole wave, a
/// single bpermute is emitted. Otherwise (wave64 only) the value is permuted
/// once as-is and once with the wave halves swapped in whole wave mode, and
/// the correct result is selected per lane.
///
/// \returns true if \p MI was replaced and erased, false if the result type
/// is not supported (in which case nothing has been emitted).
bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
    MachineInstr &MI) const {
  assert(MI.getNumOperands() == 4);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  // Reject unsupported types before looking up register classes or emitting
  // any instruction.
  const LLT DstTy = MRI->getType(DstReg);
  if (DstTy != LLT::scalar(32))
    return false;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstTy.getSizeInBits(), *DstRB);

  // ds_bpermute requires index to be multiplied by 4. Both lowering strategies
  // need this, so compute it once.
  Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
      .addImm(2)
      .addReg(IdxReg);

  // If we can bpermute across the whole wave, then just do that.
  if (Subtarget->supportsWaveWideBPermute()) {
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
        .addReg(ShiftIdxReg)
        .addReg(ValReg)
        .addImm(0);

    MI.eraseFromParent();
    return true;
  }

  // Otherwise, we need to make use of whole wave mode.
  // NOTE(review): this path emits V_PERMLANE64_B32; confirm that every wave64
  // subtarget for which supportsWaveWideBPermute() is false provides it.
  assert(Subtarget->isWave64());

  // Set inactive lanes to poison.
  Register UndefValReg =
      MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);

  Register UndefExecReg = MRI->createVirtualRegister(
      TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);

  Register PoisonValReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
      .addImm(0)
      .addReg(ValReg)
      .addImm(0)
      .addReg(UndefValReg)
      .addReg(UndefExecReg);

  Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
      .addImm(0)
      .addReg(ShiftIdxReg)
      .addImm(0)
      .addReg(UndefValReg)
      .addReg(UndefExecReg);

  // Get permutation of each half, then we'll select which one to use.
  Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
      .addReg(PoisonIdxReg)
      .addReg(PoisonValReg)
      .addImm(0);

  Register SwappedValReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
      .addReg(PoisonValReg);

  Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
      .addReg(PoisonIdxReg)
      .addReg(SwappedValReg)
      .addImm(0);

  Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
      .addReg(OppSidePermReg);

  // Select which side to take the permute from.
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
      .addImm(-1)
      .addImm(0);

  // Compare bit 5 of the lane id with bit 5 of the *unshifted* lane index.
  // Using the index already scaled by 4 would compare against bit 3 of the
  // requested lane instead, picking the wrong half. This matches the
  // SelectionDAG lowering, which XORs with the original index.
  Register XORReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
      .addReg(ThreadIDReg)
      .addReg(IdxReg);

  Register ANDReg = MRI->createVirtualRegister(DstRC);
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
      .addReg(XORReg)
      .addImm(32);

  Register CompareReg = MRI->createVirtualRegister(
      TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
      .addReg(ANDReg)
      .addImm(0);

  // Finally do the selection: the same-half result where the compare is true,
  // the swapped-half result otherwise.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(WWMSwapPermReg)
      .addImm(0)
      .addReg(SameSidePermReg)
      .addReg(CompareReg);

  MI.eraseFromParent();
  return true;
}
|
||||
|
||||
// Match BITOP3 operation and return a number of matched instructions plus
|
||||
// truth table.
|
||||
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
|
||||
|
||||
@ -156,7 +156,6 @@ private:
|
||||
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
|
||||
bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
|
||||
bool selectSBarrierLeave(MachineInstr &I) const;
|
||||
bool selectWaveShuffleIntrin(MachineInstr &I) const;
|
||||
|
||||
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
|
||||
bool IsCanonicalizing = true,
|
||||
|
||||
@ -5239,20 +5239,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_s_bitreplicate: {
|
||||
case Intrinsic::amdgcn_s_bitreplicate:
|
||||
Register MaskReg = MI.getOperand(2).getReg();
|
||||
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
|
||||
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
|
||||
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_wave_shuffle: {
|
||||
unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
|
||||
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
|
||||
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
|
||||
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1883,12 +1883,6 @@ public:
|
||||
bool requiresWaitsBeforeSystemScopeStores() const {
|
||||
return RequiresWaitsBeforeSystemScopeStores;
|
||||
}
|
||||
|
||||
bool supportsWaveWideBPermute() const {
|
||||
return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
|
||||
getGeneration() == AMDGPUSubtarget::GFX12) ||
|
||||
isWave32();
|
||||
}
|
||||
};
|
||||
|
||||
class GCNUserSGPRUsageInfo {
|
||||
|
||||
@ -7368,84 +7368,6 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
|
||||
return DAG.getBitcast(VT, UnrolledLaneOp);
|
||||
}
|
||||
|
||||
/// Lower llvm.amdgcn.wave.shuffle in SelectionDAG.
///
/// Operand 1 of \p N is the value to shuffle and operand 2 is the lane index.
/// Only 32-bit result types are handled; for anything else an empty SDValue
/// is returned.
///
/// When the subtarget reports that ds_bpermute can reach the whole wave, a
/// single ds_bpermute is emitted. Otherwise (wave64 only) the value is
/// permuted once as-is and once with the wave halves swapped in whole wave
/// mode, and the correct result is selected per lane.
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
                                SelectionDAG &DAG) {
  const EVT VT = N->getValueType(0);
  if (VT.getSizeInBits() != 32)
    return SDValue();

  SDLoc SL(N);

  SDValue Value = N->getOperand(1);
  SDValue Index = N->getOperand(2);

  // ds_bpermute requires index to be multiplied by 4.
  SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
  SDValue ShiftedIndex =
      DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);

  // Intrinsics will require i32 to operate on.
  SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);

  // Build an INTRINSIC_WO_CHAIN node for intrinsic IID. The arguments are
  // taken by ArrayRef so no vector is copied per call; the intrinsic ID is
  // the only operand that has to be a target constant.
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
                                   ArrayRef<SDValue> IntrinArgs) -> SDValue {
    SmallVector<SDValue, 4> Operands;
    Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
    Operands.append(IntrinArgs.begin(), IntrinArgs.end());
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
  };

  // If we can bpermute across the whole wave, then just do that.
  if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                     {ShiftedIndex, ValueI32});
    return DAG.getBitcast(VT, BPermute);
  }

  // NOTE(review): the path below uses amdgcn_permlane64; confirm that every
  // wave64 subtarget for which supportsWaveWideBPermute() is false provides it.
  assert(TLI.getSubtarget()->isWave64());

  // Otherwise, we need to make use of whole wave mode.
  SDValue PoisonVal = DAG.getPOISON(MVT::i32);

  // Set inactive lanes to poison.
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});

  SDValue Swapped =
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  // Get permutation of each half, then we'll select which one to use.
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
  SDValue BPermOtherHalfWWM =
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

  // Select which side to take the permute from.
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  SDValue ThreadID =
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32, {ThreadIDMask, Zero});

  // Bit 5 of (lane id ^ requested lane) is clear exactly when both lanes are
  // in the same half of the wave. The unshifted index is used on purpose.
  SDValue HalfBit = DAG.getConstant(32, SL, MVT::i32);
  SDValue LaneXorIndex = DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index);
  SDValue SameOrOtherHalf =
      DAG.getNode(ISD::AND, SL, MVT::i32, LaneXorIndex, HalfBit);
  SDValue UseSameHalf =
      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf, Zero, ISD::SETEQ);
  SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
                                 BPermOtherHalfWWM);
  return DAG.getBitcast(VT, Result);
}
|
||||
|
||||
void SITargetLowering::ReplaceNodeResults(SDNode *N,
|
||||
SmallVectorImpl<SDValue> &Results,
|
||||
SelectionDAG &DAG) const {
|
||||
@ -10364,8 +10286,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
Poisons.push_back(DAG.getPOISON(ValTy));
|
||||
return DAG.getMergeValues(Poisons, SDLoc(Op));
|
||||
}
|
||||
case Intrinsic::amdgcn_wave_shuffle:
|
||||
return lowerWaveShuffle(*this, Op.getNode(), DAG);
|
||||
default:
|
||||
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
|
||||
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
|
||||
|
||||
@ -1,226 +0,0 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8-W32 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-W32 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
|
||||
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX8-W64 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX9-W64 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
|
||||
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8-W32-GISEL %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-W32-GISEL %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32-GISEL %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32-GISEL %s
|
||||
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX8-W64-GISEL %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX9-W64-GISEL %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64-GISEL %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64-GISEL %s
|
||||
|
||||
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX6-SDAG-ERR %s
|
||||
; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX6-GISEL-ERR %s
|
||||
|
||||
; GFX6-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.bpermute
|
||||
; GFX6-GISEL-ERR: "Invalid opcode!"
|
||||
|
||||
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX7-SDAG-ERR %s
|
||||
; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefixes=GFX7-GISEL-ERR %s
|
||||
|
||||
; GFX7-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.bpermute
|
||||
; GFX7-GISEL-ERR: "Invalid opcode!"
|
||||
|
||||
|
||||
define float @test_wave_shuffle_float(float %val, i32 %idx) {
|
||||
; GFX8-W32-LABEL: test_wave_shuffle_float:
|
||||
; GFX8-W32: ; %bb.0: ; %entry
|
||||
; GFX8-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX8-W32-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX8-W32-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-W32-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-W32-LABEL: test_wave_shuffle_float:
|
||||
; GFX9-W32: ; %bb.0: ; %entry
|
||||
; GFX9-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX9-W32-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX9-W32-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-W32-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-W32-LABEL: test_wave_shuffle_float:
|
||||
; GFX11-W32: ; %bb.0: ; %entry
|
||||
; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-W32-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX12-W32-LABEL: test_wave_shuffle_float:
|
||||
; GFX12-W32: ; %bb.0: ; %entry
|
||||
; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-W32-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-W32-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-W32-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-W32-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX12-W32-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-W32-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-W64-LABEL: test_wave_shuffle_float:
|
||||
; GFX8-W64: ; %bb.0: ; %entry
|
||||
; GFX8-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX8-W64-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX8-W64-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-W64-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-W64-LABEL: test_wave_shuffle_float:
|
||||
; GFX9-W64: ; %bb.0: ; %entry
|
||||
; GFX9-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX9-W64-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-W64-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-W64-LABEL: test_wave_shuffle_float:
|
||||
; GFX11-W64: ; %bb.0: ; %entry
|
||||
; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
|
||||
; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
|
||||
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
|
||||
; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
||||
; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
|
||||
; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX11-W64-NEXT: v_permlane64_b32 v2, v0
|
||||
; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2
|
||||
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0
|
||||
; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0
|
||||
; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1
|
||||
; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1
|
||||
; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
|
||||
; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
|
||||
; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
|
||||
; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
|
||||
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX11-W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-W64-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX12-W64-LABEL: test_wave_shuffle_float:
|
||||
; GFX12-W64: ; %bb.0: ; %entry
|
||||
; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-W64-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-W64-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-W64-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-W64-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX12-W64-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-W64-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-W32-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX8-W32-GISEL: ; %bb.0: ; %entry
|
||||
; GFX8-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX8-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX8-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-W32-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX9-W32-GISEL: ; %bb.0: ; %entry
|
||||
; GFX9-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX9-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX9-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-W32-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX11-W32-GISEL: ; %bb.0: ; %entry
|
||||
; GFX11-W32-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX11-W32-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX12-W32-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX12-W32-GISEL: ; %bb.0: ; %entry
|
||||
; GFX12-W32-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-W32-GISEL-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-W32-GISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-W32-GISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-W32-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-W32-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX12-W32-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX12-W32-GISEL-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-W32-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-W64-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX8-W64-GISEL: ; %bb.0: ; %entry
|
||||
; GFX8-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX8-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX8-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-W64-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX9-W64-GISEL: ; %bb.0: ; %entry
|
||||
; GFX9-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX9-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX9-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-W64-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX11-W64-GISEL: ; %bb.0: ; %entry
|
||||
; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
|
||||
; GFX11-W64-GISEL-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
|
||||
; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX11-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
||||
; GFX11-W64-GISEL-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
|
||||
; GFX11-W64-GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX11-W64-GISEL-NEXT: v_permlane64_b32 v2, v0
|
||||
; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v2, v1, v2
|
||||
; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX11-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX11-W64-GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0
|
||||
; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-W64-GISEL-NEXT: v_xor_b32_e32 v1, v3, v1
|
||||
; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX11-W64-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX11-W64-GISEL-NEXT: v_and_b32_e32 v1, 32, v1
|
||||
; GFX11-W64-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-W64-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
|
||||
; GFX11-W64-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-W64-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
|
||||
; GFX11-W64-GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
|
||||
; GFX11-W64-GISEL-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
|
||||
; GFX11-W64-GISEL-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX11-W64-GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX12-W64-GISEL-LABEL: test_wave_shuffle_float:
|
||||
; GFX12-W64-GISEL: ; %bb.0: ; %entry
|
||||
; GFX12-W64-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-W64-GISEL-NEXT: s_wait_expcnt 0x0
|
||||
; GFX12-W64-GISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-W64-GISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-W64-GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-W64-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX12-W64-GISEL-NEXT: ds_bpermute_b32 v0, v1, v0
|
||||
; GFX12-W64-GISEL-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-W64-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%0 = tail call float @llvm.amdgcn.wave.shuffle(float %val, i32 %idx)
|
||||
ret float %0
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user