[AMDGPU] Remove AMDGPUISD::FFBH_I32 and add ISD::CTLS lowering (#187694)
This is a continuation of the previously reverted https://github.com/llvm/llvm-project/pull/178420.

The patch removes the custom AMDGPUISD::FFBH_I32 SelectionDAG node. Call sites that need the raw hardware semantics (LowerINT_TO_FP32, legalizeITOFP) now use the amdgcn_sffbh intrinsic directly. ISD::CTLS is added as a Custom operation for i32.

The previous attempt had an issue: the hardware v_ffbh_i32 instruction (v_cls_i32 on newer targets) has different semantics than ISD::CTLS:
- sffbh returns [1, BitWidth-1] for normal values and -1 for all-same-bits inputs.
- CTLS returns [0, BitWidth-2] for normal values and BitWidth-1 for all-same-bits inputs.

LowerCTLS now handles this as: sffbh -> umin(sffbh, BitWidth) -> sub 1.

The current patch also adds a DAG combine that recognizes the common CTLS idiom, sub(ctlz(xor(x, sra(x, BitWidth-1))), 1) -> ctls(x), and an optimization in performMinMaxCombine that folds away the umin when the input is known not to be all-same-bits.

Partially addresses #177635.
This commit is contained in:
parent
249a3d19dd
commit
76f88063b6
@ -5620,6 +5620,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
|
||||
case G_CTTZ:
|
||||
case G_CTTZ_ZERO_UNDEF:
|
||||
case G_CTPOP:
|
||||
case G_CTLS:
|
||||
case G_FCOPYSIGN:
|
||||
case G_ZEXT:
|
||||
case G_SEXT:
|
||||
|
||||
@ -1473,6 +1473,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
|
||||
case ISD::CTLZ:
|
||||
case ISD::CTLZ_ZERO_UNDEF:
|
||||
return LowerCTLZ_CTTZ(Op, DAG);
|
||||
case ISD::CTLS:
|
||||
return LowerCTLS(Op, DAG);
|
||||
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
|
||||
}
|
||||
return Op;
|
||||
@ -3418,6 +3420,19 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
|
||||
return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
|
||||
}
|
||||
|
||||
/// Custom lowering of ISD::CTLS for i32.
///
/// Emits umin(amdgcn.sffbh(x), 32) + (-1). Per the commit description, sffbh
/// yields [1, 31] for ordinary inputs and -1 when all bits are equal, whereas
/// CTLS must yield [0, 30] and 31 respectively; the unsigned clamp turns the
/// -1 sentinel into 32 before the final decrement.
///
/// \param Op  The ISD::CTLS node; its operand 0 must be of type i32.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns The i32 value that replaces \p Op.
SDValue AMDGPUTargetLowering::LowerCTLS(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Input = Op.getOperand(0);
  assert(Input.getValueType() == MVT::i32 && "LowerCTLS only supports i32");

  // Raw hardware count, expressed through the sffbh intrinsic.
  SDValue IntrinsicID =
      DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, DL, MVT::i32);
  SDValue RawCount = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
                                 IntrinsicID, Input);

  // Unsigned clamp to the bit width (32).
  SDValue Limit = DAG.getConstant(32, DL, MVT::i32);
  SDValue Bounded = DAG.getNode(ISD::UMIN, DL, MVT::i32, RawCount, Limit);

  // Subtract one, written as an add of the all-ones constant.
  SDValue MinusOne = DAG.getAllOnesConstant(DL, MVT::i32);
  return DAG.getNode(ISD::ADD, DL, MVT::i32, Bounded, MinusOne);
}
|
||||
|
||||
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
|
||||
bool Signed) const {
|
||||
// The regular method converting a 64-bit integer to float roughly consists of
|
||||
@ -3482,7 +3497,9 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
|
||||
DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
|
||||
OppositeSign);
|
||||
// Count the leading sign bits.
|
||||
ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
|
||||
ShAmt = DAG.getNode(
|
||||
ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
|
||||
DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi);
|
||||
// Different from unsigned conversion, the shift should be one bit less to
|
||||
// preserve the sign bit.
|
||||
ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
|
||||
|
||||
@ -51,6 +51,7 @@ protected:
|
||||
/// Split a vector store into multiple scalar stores.
|
||||
/// \returns The resulting chain.
|
||||
|
||||
/// Custom lowering for ISD::CTLS; the implementation asserts an i32 source
/// and builds the result from the amdgcn_sffbh intrinsic.
SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
@ -320,7 +320,6 @@ def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
|
||||
|
||||
// ctlz with -1 if input is zero.
|
||||
def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
|
||||
def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
|
||||
|
||||
// cttz with -1 if input is zero.
|
||||
def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
|
||||
@ -494,9 +493,6 @@ def AMDGPUdiv_fixup : PatFrags<(ops node:$src0, node:$src1, node:$src2),
|
||||
[(int_amdgcn_div_fixup node:$src0, node:$src1, node:$src2),
|
||||
(AMDGPUdiv_fixup_impl node:$src0, node:$src1, node:$src2)]>;
|
||||
|
||||
def AMDGPUffbh_i32 : PatFrags<(ops node:$src),
|
||||
[(int_amdgcn_sffbh node:$src),
|
||||
(AMDGPUffbh_i32_impl node:$src)]>;
|
||||
|
||||
def AMDGPUffbh_u32 : PatFrags<(ops node:$src),
|
||||
[(ctlz_zero_undef node:$src),
|
||||
|
||||
@ -1387,6 +1387,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
|
||||
.widenScalarToNextPow2(0, 32)
|
||||
.widenScalarToNextPow2(1, 32);
|
||||
|
||||
getActionDefinitionsBuilder(G_CTLS)
|
||||
.customFor({{S32, S32}})
|
||||
.scalarize(0)
|
||||
.clampScalar(0, S32, S32)
|
||||
.clampScalar(1, S32, S32);
|
||||
|
||||
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
|
||||
// RegBankSelect.
|
||||
getActionDefinitionsBuilder(G_BITREVERSE)
|
||||
@ -2310,6 +2316,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
|
||||
case TargetOpcode::G_CTLZ:
|
||||
case TargetOpcode::G_CTTZ:
|
||||
return legalizeCTLZ_CTTZ(MI, MRI, B);
|
||||
case TargetOpcode::G_CTLS:
|
||||
return legalizeCTLS(MI, MRI, B);
|
||||
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
|
||||
return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
|
||||
case TargetOpcode::G_STACKSAVE:
|
||||
@ -4682,6 +4690,23 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Custom legalization of G_CTLS for s32.
///
/// Replaces \p MI with G_SUB(G_UMIN(amdgcn.sffbh(src), 32), 1) and erases the
/// original instruction. The unsigned clamp maps the intrinsic's -1 result
/// (described in the commit message for all-same-bits inputs) to the bit
/// width before the subtraction.
///
/// \param MI  The G_CTLS instruction; operand 0 is the def, operand 1 the source.
/// \param MRI Used to query the source register's type.
/// \param B   Builder used to emit the replacement instructions.
/// \returns true unconditionally.
bool AMDGPULegalizerInfo::legalizeCTLS(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  const LLT S32 = LLT::scalar(32);
  Register Result = MI.getOperand(0).getReg();
  Register Input = MI.getOperand(1).getReg();
  LLT InputTy = MRI.getType(Input);
  assert(InputTy == S32 && "legalizeCTLS only supports s32");
  unsigned NumBits = InputTy.getSizeInBits();

  // Emission order is kept as: intrinsic, clamp constant, umin, one, sub.
  auto RawCount =
      B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Input);
  auto Limit = B.buildConstant(S32, NumBits);
  auto Bounded = B.buildUMin(S32, RawCount, Limit);
  auto One = B.buildConstant(S32, 1);
  B.buildSub(Result, Bounded, One);

  MI.eraseFromParent();
  return true;
}
|
||||
|
||||
// Check that this is a G_XOR x, -1
|
||||
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
|
||||
if (MI.getOpcode() != TargetOpcode::G_XOR)
|
||||
|
||||
@ -119,6 +119,8 @@ public:
|
||||
MachineIRBuilder &B) const;
|
||||
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
|
||||
MachineIRBuilder &B) const;
|
||||
/// Custom legalization for G_CTLS; the implementation asserts an s32 source,
/// rewrites it in terms of the amdgcn_sffbh intrinsic and erases \p MI.
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI,
|
||||
MachineIRBuilder &B) const;
|
||||
|
||||
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B,
|
||||
const ArgDescriptor *Arg,
|
||||
|
||||
@ -518,6 +518,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
|
||||
setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
|
||||
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
|
||||
setOperationAction(ISD::CTLS, MVT::i32, Custom);
|
||||
|
||||
// We only really have 32-bit BFE instructions (and 16-bit on VI).
|
||||
//
|
||||
@ -10648,8 +10649,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
case Intrinsic::amdgcn_fmul_legacy:
|
||||
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
|
||||
Op.getOperand(2));
|
||||
case Intrinsic::amdgcn_sffbh:
|
||||
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
|
||||
case Intrinsic::amdgcn_sbfe:
|
||||
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
|
||||
Op.getOperand(2), Op.getOperand(3));
|
||||
@ -15760,6 +15759,21 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
|
||||
}
|
||||
}
|
||||
|
||||
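// umin(sffbh(x), bitwidth) -> sffbh(x) if x is known to be neither 0 nor -1.
// NOTE(review): the check below uses `!Known.isAllOnes()`, which appears to
// mean only "x is not known to be -1" rather than "x is known not to be -1"
// (e.g. x = y | 1 may still be -1). Proving x != -1 presumably needs at least
// one known-zero bit (`!Known.Zero.isZero()`) — confirm against KnownBits.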
|
||||
SDValue FfbhSrc;
|
||||
uint64_t Clamp = 0;
|
||||
if (Opc == ISD::UMIN &&
|
||||
sd_match(Op0,
|
||||
m_IntrinsicWOChain<Intrinsic::amdgcn_sffbh>(m_Value(FfbhSrc))) &&
|
||||
sd_match(Op1, m_ConstInt(Clamp))) {
|
||||
unsigned BitWidth = FfbhSrc.getValueType().getScalarSizeInBits();
|
||||
if (Clamp >= BitWidth) {
|
||||
KnownBits Known = DAG.computeKnownBits(FfbhSrc);
|
||||
if (Known.isNonZero() && !Known.isAllOnes())
|
||||
return Op0;
|
||||
}
|
||||
}
|
||||
|
||||
// min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
|
||||
// max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
|
||||
if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
|
||||
@ -17008,6 +17022,10 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
static bool isCtlzOpc(unsigned Opc) {
|
||||
return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::performSubCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
@ -17053,6 +17071,27 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
|
||||
SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
|
||||
return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
|
||||
}
|
||||
|
||||
// sub (ctlz (xor x, (sra x, 31))), 1 -> ctls x.
|
||||
if (isOneConstant(RHS) && isCtlzOpc(LHS.getOpcode())) {
|
||||
SDValue CtlzSrc = LHS.getOperand(0);
|
||||
// Check for xor x, (sra x, 31) pattern.
|
||||
if (CtlzSrc.getOpcode() == ISD::XOR) {
|
||||
SDValue X = CtlzSrc.getOperand(0);
|
||||
SDValue SignExt = CtlzSrc.getOperand(1);
|
||||
// Try both orderings of the XOR operands.
|
||||
if (SignExt.getOpcode() != ISD::SRA)
|
||||
std::swap(X, SignExt);
|
||||
if (SignExt.getOpcode() == ISD::SRA && SignExt.getOperand(0) == X) {
|
||||
ConstantSDNode *ShiftAmt =
|
||||
dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
|
||||
unsigned BitWidth = X.getValueType().getScalarSizeInBits();
|
||||
if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1)
|
||||
return DAG.getNode(ISD::CTLS, SL, VT, X);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
||||
@ -293,7 +293,7 @@ def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64",
|
||||
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))]
|
||||
>;
|
||||
def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
|
||||
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))]
|
||||
[(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_sffbh> i32:$src0))]
|
||||
>;
|
||||
def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
|
||||
def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
|
||||
|
||||
@ -370,7 +370,7 @@ defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
|
||||
defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, DivergentUnaryFrag<bitreverse>>;
|
||||
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
|
||||
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
|
||||
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
|
||||
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, int_amdgcn_sffbh>;
|
||||
|
||||
let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in {
|
||||
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
|
||||
|
||||
159
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctls.mir
Normal file
159
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctls.mir
Normal file
@ -0,0 +1,159 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 -run-pass=legalizer %s -o - | FileCheck %s
|
||||
|
||||
---
|
||||
name: ctls_s32
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
|
||||
; CHECK-LABEL: name: ctls_s32
|
||||
; CHECK: liveins: $vgpr0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY]](s32)
|
||||
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
|
||||
; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]]
|
||||
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
|
||||
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]]
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32)
|
||||
%0:_(s32) = COPY $vgpr0
|
||||
%1:_(s32) = G_CTLS %0
|
||||
$vgpr0 = COPY %1
|
||||
...
|
||||
|
||||
---
|
||||
name: ctls_s16
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
|
||||
; CHECK-LABEL: name: ctls_s16
|
||||
; CHECK: liveins: $vgpr0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16
|
||||
; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[SEXT_INREG]](s32)
|
||||
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
|
||||
; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]]
|
||||
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
|
||||
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]]
|
||||
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
|
||||
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SUB]], [[C2]]
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
|
||||
%0:_(s32) = COPY $vgpr0
|
||||
%1:_(s16) = G_TRUNC %0
|
||||
%2:_(s32) = G_CTLS %1
|
||||
$vgpr0 = COPY %2
|
||||
...
|
||||
|
||||
---
|
||||
name: ctls_v2s32
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
; CHECK-LABEL: name: ctls_v2s32
|
||||
; CHECK: liveins: $vgpr0, $vgpr1
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY]](s32)
|
||||
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
|
||||
; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]]
|
||||
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
|
||||
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]]
|
||||
; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY1]](s32)
|
||||
; CHECK-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[INT1]], [[C]]
|
||||
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMIN1]], [[C1]]
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32)
|
||||
; CHECK-NEXT: $vgpr1 = COPY [[SUB1]](s32)
|
||||
%0:_(s32) = COPY $vgpr0
|
||||
%1:_(s32) = COPY $vgpr1
|
||||
%2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1
|
||||
%3:_(<2 x s32>) = G_CTLS %2
|
||||
%4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3
|
||||
$vgpr0 = COPY %4
|
||||
$vgpr1 = COPY %5
|
||||
...
|
||||
|
||||
---
|
||||
name: ctls_v2s16
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
|
||||
; CHECK-LABEL: name: ctls_v2s16
|
||||
; CHECK: liveins: $vgpr0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
|
||||
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
|
||||
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
|
||||
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
|
||||
; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 16
|
||||
; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[SEXT_INREG]](s32)
|
||||
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
|
||||
; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C1]]
|
||||
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
|
||||
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C2]]
|
||||
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SUB]], [[C]]
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
|
||||
; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16
|
||||
; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[SEXT_INREG1]](s32)
|
||||
; CHECK-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[INT1]], [[C1]]
|
||||
; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UMIN1]], [[C2]]
|
||||
; CHECK-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[C]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB3]](s32)
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
|
||||
; CHECK-NEXT: $vgpr1 = COPY [[COPY2]](s32)
|
||||
%0:_(<2 x s16>) = COPY $vgpr0
|
||||
%1:_(<2 x s32>) = G_CTLS %0
|
||||
%2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %1
|
||||
$vgpr0 = COPY %2
|
||||
$vgpr1 = COPY %3
|
||||
...
|
||||
|
||||
---
|
||||
name: ctls_v4s32
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
||||
|
||||
; CHECK-LABEL: name: ctls_v4s32
|
||||
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
|
||||
; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY]](s32)
|
||||
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
|
||||
; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]]
|
||||
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
|
||||
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]]
|
||||
; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY1]](s32)
|
||||
; CHECK-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[INT1]], [[C]]
|
||||
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMIN1]], [[C1]]
|
||||
; CHECK-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY2]](s32)
|
||||
; CHECK-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[INT2]], [[C]]
|
||||
; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UMIN2]], [[C1]]
|
||||
; CHECK-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY3]](s32)
|
||||
; CHECK-NEXT: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[INT3]], [[C]]
|
||||
; CHECK-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMIN3]], [[C1]]
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32)
|
||||
; CHECK-NEXT: $vgpr1 = COPY [[SUB1]](s32)
|
||||
; CHECK-NEXT: $vgpr2 = COPY [[SUB2]](s32)
|
||||
; CHECK-NEXT: $vgpr3 = COPY [[SUB3]](s32)
|
||||
%0:_(s32) = COPY $vgpr0
|
||||
%1:_(s32) = COPY $vgpr1
|
||||
%2:_(s32) = COPY $vgpr2
|
||||
%3:_(s32) = COPY $vgpr3
|
||||
%4:_(<4 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3
|
||||
%5:_(<4 x s32>) = G_CTLS %4
|
||||
%6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32) = G_UNMERGE_VALUES %5
|
||||
$vgpr0 = COPY %6
|
||||
$vgpr1 = COPY %7
|
||||
$vgpr2 = COPY %8
|
||||
$vgpr3 = COPY %9
|
||||
...
|
||||
624
llvm/test/CodeGen/AMDGPU/ctls.ll
Normal file
624
llvm/test/CodeGen/AMDGPU/ctls.ll
Normal file
@ -0,0 +1,624 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
|
||||
|
||||
declare i32 @llvm.ctlz.i32(i32, i1)
|
||||
declare i64 @llvm.ctlz.i64(i64, i1)
|
||||
declare i32 @llvm.amdgcn.sffbh.i32(i32)
|
||||
|
||||
; Test that ctls(x) is lowered to umin(ffbh_i32(x), bitwidth) - 1
|
||||
; ctls is formed by the DAG combiner from: ctlz(x ^ ashr(x, 31)) - 1
|
||||
define i32 @ctls_i32(i32 %x) {
|
||||
; GFX6-LABEL: ctls_i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_i32:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i32 %x, 31
|
||||
%b = xor i32 %x, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
define i32 @ctls_i32_known_positive(i32 %x) {
|
||||
; GFX6-LABEL: ctls_i32_known_positive:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_i32_known_positive:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%pos = and i32 %x, 2147483647
|
||||
%a = ashr i32 %pos, 31
|
||||
%b = xor i32 %pos, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
; sub(ctlz(xor(x, sra(x, 31))), 1) -> ctls(x)
|
||||
define i32 @ctls_i32_xor_commuted(i32 %x) {
|
||||
; GFX6-LABEL: ctls_i32_xor_commuted:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_i32_xor_commuted:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i32 %x, 31
|
||||
%b = xor i32 %a, %x ; note: reversed order compared to ctls_i32
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
define i32 @ctls_i32_zero_undef(i32 %x) {
|
||||
; GFX6-LABEL: ctls_i32_zero_undef:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_i32_zero_undef:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i32 %x, 31
|
||||
%b = xor i32 %x, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 true) ; zero_undef = true
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
; umin(ffbh_i32(x), 32) -> ffbh_i32(x).
|
||||
define i32 @ctls_i32_known_mixed_bits(i32 %x) {
|
||||
; GFX6-LABEL: ctls_i32_known_mixed_bits:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_i32_known_mixed_bits:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_or_b32_e32 v0, 1, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; Force bit 31 = 0 and bit 0 = 1, so value is neither all-0s nor all-1s
|
||||
%cleared = and i32 %x, 2147483647 ; clear bit 31
|
||||
%mixed = or i32 %cleared, 1 ; set bit 0
|
||||
%a = ashr i32 %mixed, 31
|
||||
%b = xor i32 %mixed, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
; test for i64 CTLS.
|
||||
define i32 @ctls_i64(i64 %x) {
|
||||
; GFX6-LABEL: ctls_i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX6-NEXT: v_min3_u32 v0, v0, v1, 64
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_i64:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
|
||||
; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i64 %x, 63
|
||||
%b = xor i64 %x, %a
|
||||
%c = call i64 @llvm.ctlz.i64(i64 %b, i1 false)
|
||||
%d = sub i64 %c, 1
|
||||
%e = trunc i64 %d to i32
|
||||
ret i32 %e
|
||||
}
|
||||
|
||||
; i16 CTLS via the sub(ctlz(xor(x, sra(x, 15))), 1) pattern.
|
||||
declare i16 @llvm.ctlz.i16(i16, i1)
|
||||
define i16 @ctls_i16(i16 %x) {
|
||||
; GFX6-LABEL: ctls_i16:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, 17, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_i16:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_ashrrev_i16 v0.h, 15, v0.l
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_xor_b16 v0.l, v0.l, v0.h
|
||||
; GFX11-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_nc_u16 v0.l, v0.l, -1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i16 %x, 15
|
||||
%b = xor i16 %x, %a
|
||||
%c = call i16 @llvm.ctlz.i16(i16 %b, i1 false)
|
||||
%d = sub i16 %c, 1
|
||||
ret i16 %d
|
||||
}
|
||||
|
||||
; uniform input should use scalar sffbh.
|
||||
define amdgpu_ps i32 @ctls_i32_salu(i32 inreg %x) {
|
||||
; GFX6-LABEL: ctls_i32_salu:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_flbit_i32 s0, s0
|
||||
; GFX6-NEXT: s_min_u32 s0, s0, 32
|
||||
; GFX6-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: ctls_i32_salu:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_cls_i32 s0, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_min_u32 s0, s0, 32
|
||||
; GFX11-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
%a = ashr i32 %x, 31
|
||||
%b = xor i32 %x, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
define <2 x i32> @ctls_v2i32(<2 x i32> %x) {
|
||||
; GFX6-LABEL: ctls_v2i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v3
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_v2i32:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr <2 x i32> %x, <i32 31, i32 31>
|
||||
%b = xor <2 x i32> %x, %a
|
||||
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
|
||||
%d = sub <2 x i32> %c, <i32 1, i32 1>
|
||||
ret <2 x i32> %d
|
||||
}
|
||||
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1)
|
||||
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
|
||||
|
||||
; <4 x i32> form of the idiom sub(ctlz(xor(x, ashr(x, 31))), 1).
; Both check prefixes expect one v_ffbh_i32 (GFX6) or v_cls_i32 (GFX11) per
; lane, each followed by a min against 32 and an add of -1.
define <4 x i32> @ctls_v4i32(<4 x i32> %x) {
|
||||
; GFX6-LABEL: ctls_v4i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v1, v1
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v2, v2
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v3, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, -1, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v3
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_v4i32:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: v_cls_i32_e32 v1, v1
|
||||
; GFX11-NEXT: v_cls_i32_e32 v2, v2
|
||||
; GFX11-NEXT: v_cls_i32_e32 v3, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; GFX11-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v2, -1, v2
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; Per-lane sign mask: the sign bit replicated across all 32 bits.
%a = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
|
||||
; Inverts negative lanes so that redundant sign bits become leading zeros.
%b = xor <4 x i32> %x, %a
|
||||
%c = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %b, i1 false)
|
||||
; The leading-zero count includes the sign bit itself; drop it.
%d = sub <4 x i32> %c, <i32 1, i32 1, i32 1, i32 1>
|
||||
ret <4 x i32> %d
|
||||
}
|
||||
|
||||
; Vector known mixed bits: umin should be folded away for each element.
|
||||
define <2 x i32> @ctls_v2i32_known_mixed_bits(<2 x i32> %x) {
|
||||
; GFX6-LABEL: ctls_v2i32_known_mixed_bits:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_or_b32_e32 v1, 1, v1
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_v2i32_known_mixed_bits:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_or_b32_e32 v0, 1, v0
|
||||
; GFX11-NEXT: v_or_b32_e32 v1, 1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; Clear bit 31 so every lane is non-negative.
%cleared = and <2 x i32> %x, <i32 2147483647, i32 2147483647>
|
||||
; Set bit 0 so every lane is also non-zero, i.e. never all-same-bits.
%mixed = or <2 x i32> %cleared, <i32 1, i32 1>
|
||||
; The sign bit is known clear, so this mask is zero and %b equals %mixed.
%a = ashr <2 x i32> %mixed, <i32 31, i32 31>
|
||||
%b = xor <2 x i32> %mixed, %a
|
||||
; %b is never zero, so the count stays below 32; the checks above accordingly
; contain no v_min_u32.
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
|
||||
%d = sub <2 x i32> %c, <i32 1, i32 1>
|
||||
ret <2 x i32> %d
|
||||
}
|
||||
|
||||
; Vector with ctlz_zero_undef.
|
||||
define <2 x i32> @ctls_v2i32_zero_undef(<2 x i32> %x) {
|
||||
; GFX6-LABEL: ctls_v2i32_zero_undef:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v3
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_v2i32_zero_undef:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; Per-lane sign mask and conditional inversion, as in the plain ctls idiom.
%a = ashr <2 x i32> %x, <i32 31, i32 31>
|
||||
%b = xor <2 x i32> %x, %a
|
||||
; The i1 true flag makes the result poison for a zero input (per the LLVM
; LangRef), so no clamp is needed; the checks above contain no v_min_u32.
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 true)
|
||||
%d = sub <2 x i32> %c, <i32 1, i32 1>
|
||||
ret <2 x i32> %d
|
||||
}
|
||||
|
||||
; Vector commuted XOR operands.
|
||||
define <2 x i32> @ctls_v2i32_xor_commuted(<2 x i32> %x) {
|
||||
; GFX6-LABEL: ctls_v2i32_xor_commuted:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v1, v2, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v3, v0
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_v2i32_xor_commuted:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v2, v0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v3, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr <2 x i32> %x, <i32 31, i32 31>
|
||||
; Operands are swapped here: the sign mask %a comes first and %x second.
%b = xor <2 x i32> %a, %x
|
||||
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
|
||||
%d = sub <2 x i32> %c, <i32 1, i32 1>
|
||||
ret <2 x i32> %d
|
||||
}
|
||||
|
||||
; Vector known positive: umin should NOT be folded.
|
||||
define <2 x i32> @ctls_v2i32_known_positive(<2 x i32> %x) {
|
||||
; GFX6-LABEL: ctls_v2i32_known_positive:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: ctls_v2i32_known_positive:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; Clear bit 31 so every lane is non-negative; a lane may still be zero.
%pos = and <2 x i32> %x, <i32 2147483647, i32 2147483647>
|
||||
; The sign bit is known clear, so this mask is zero and %b equals %pos.
%a = ashr <2 x i32> %pos, <i32 31, i32 31>
|
||||
%b = xor <2 x i32> %pos, %a
|
||||
; A zero lane still has to yield 32 here, which is why the checks above keep
; the v_min_u32 against 32.
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false)
|
||||
%d = sub <2 x i32> %c, <i32 1, i32 1>
|
||||
ret <2 x i32> %d
|
||||
}
|
||||
|
||||
; @llvm.amdgcn.sffbh must still produce raw hardware result.
|
||||
define i32 @sffbh_intrinsic(i32 %x) {
|
||||
; GFX6-LABEL: sffbh_intrinsic:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: sffbh_intrinsic:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; Direct call to the target intrinsic. The checks above expect only
; v_ffbh_i32 (GFX6) or v_cls_i32 (GFX11), with no min or add after it.
%r = call i32 @llvm.amdgcn.sffbh.i32(i32 %x)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; sitofp i64 to f32 uses sffbh(Hi)-1, not CTLS.
|
||||
define float @sitofp_i64_to_f32(i64 %x) {
|
||||
; GFX6-LABEL: sitofp_i64_to_f32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, v0, v1
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v2
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v3, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 32, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
|
||||
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
|
||||
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: sitofp_i64_to_f32:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1
|
||||
; GFX11-NEXT: v_cls_i32_e32 v3, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
|
||||
; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
|
||||
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; Plain signed i64 -> float conversion. In the checks above, v_ffbh_i32 (GFX6)
; or v_cls_i32 (GFX11) on v1 feeds the add of -1 directly, with no clamp to 32
; in between.
%r = sitofp i64 %x to float
|
||||
ret float %r
|
||||
}
|
||||
|
||||
; Negative tests:
|
||||
; The shift amount is 30 rather than 31, so %a is not a replicated sign bit.
; The checks keep the explicit ashr/xor and the unsigned count
; (v_ffbh_u32 on GFX6, v_clz_i32_u32 on GFX11), then a min against 32 and an
; add of -1.
define i32 @no_ctls_wrong_shift(i32 %x) {
|
||||
; GFX6-LABEL: no_ctls_wrong_shift:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 30, v0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: no_ctls_wrong_shift:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 30, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i32 %x, 30
|
||||
%b = xor i32 %x, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
; The sign mask is computed from %y but xor'ed with %x, so the two operands of
; the idiom do not refer to the same value. The checks keep the explicit
; ashr/xor and the unsigned count (v_ffbh_u32 on GFX6, v_clz_i32_u32 on GFX11),
; then a min against 32 and an add of -1.
define i32 @no_ctls_xor_different_value(i32 %x, i32 %y) {
|
||||
; GFX6-LABEL: no_ctls_xor_different_value:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: no_ctls_xor_different_value:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i32 %y, 31
|
||||
%b = xor i32 %x, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 1
|
||||
ret i32 %d
|
||||
}
|
||||
|
||||
; Same ashr/xor/ctlz sequence as the ctls idiom, but the final subtraction is
; by 2 instead of 1. The checks expect v_ffbh_i32 (GFX6) or v_cls_i32 (GFX11),
; a min against 32, and an add of -2.
; NOTE(review): despite the no_ctls name, the expected code still uses the
; signed count instruction; confirm this is the intended outcome.
define i32 @no_ctls_sub_2(i32 %x) {
|
||||
; GFX6-LABEL: no_ctls_sub_2:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_i32_e32 v0, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, -2, v0
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: no_ctls_sub_2:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cls_i32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX11-NEXT: v_add_nc_u32_e32 v0, -2, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%a = ashr i32 %x, 31
|
||||
%b = xor i32 %x, %a
|
||||
%c = call i32 @llvm.ctlz.i32(i32 %b, i1 false)
|
||||
%d = sub i32 %c, 2
|
||||
ret i32 %d
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user