From 76f88063b651589474d52b90b06ad10bf55bb480 Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Thu, 26 Mar 2026 16:14:34 +0100 Subject: [PATCH] [AMDGPU] Remove AMDGPUISD::FFBH_I32 and add ISD::CTLS lowering (#187694) It's a continuation of the previously reverted https://github.com/llvm/llvm-project/pull/178420 The patch removes the custom AMDGPUISD::FFBH_I32 SelectionDAG node. Call sites that need raw hardware semantics (LowerINT_TO_FP32, legalizeITOFP) now use the amdgcn_sffbh intrinsic directly. ISD::CTLS is added as a Custom operation for i32. The previous attempt had an issue: The hardware v_ffbh_i32 instruction (v_cls_i32 on newer targets) has different semantics than ISD::CTLS: -sffbh returns [1, BitWidth-1] for normal values, -1 for all-same-bits -CTLS returns [0, BitWidth-2] for normal values, BitWidth-1 for all-same-bits Now LowerCTLS handles this by: sffbh -> umin(sffbh, BitWidth) -> sub 1. The current patch also adds a DAG combine to recognize the common CTLS idiom: sub(ctlz(xor(x, sra(x, BitWidth-1))), 1) -> ctls(x) and an optimization in performMinMaxCombine to fold away umin when the input is not all-same-bits. 
Partially addresses #177635 --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 19 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 4 - .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 25 + llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 43 +- llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- .../AMDGPU/GlobalISel/legalize-ctls.mir | 159 +++++ llvm/test/CodeGen/AMDGPU/ctls.ll | 624 ++++++++++++++++++ 11 files changed, 873 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctls.mir create mode 100644 llvm/test/CodeGen/AMDGPU/ctls.ll diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index b813cda34890..72ca4380a630 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5620,6 +5620,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_CTTZ: case G_CTTZ_ZERO_UNDEF: case G_CTPOP: + case G_CTLS: case G_FCOPYSIGN: case G_ZEXT: case G_SEXT: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 6bd3556801e3..e7c5b2cb5cc4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1473,6 +1473,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_CTTZ(Op, DAG); + case ISD::CTLS: + return LowerCTLS(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; @@ -3418,6 +3420,19 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); } +SDValue AMDGPUTargetLowering::LowerCTLS(SDValue Op, 
SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32"); + SDValue Ffbh = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, + DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Src); + SDValue Clamped = DAG.getNode(ISD::UMIN, SL, MVT::i32, Ffbh, + DAG.getConstant(32, SL, MVT::i32)); + return DAG.getNode(ISD::ADD, SL, MVT::i32, Clamped, + DAG.getAllOnesConstant(SL, MVT::i32)); +} + SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const { // The regular method converting a 64-bit integer to float roughly consists of @@ -3482,7 +3497,9 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32), OppositeSign); // Count the leading sign bits. - ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi); + ShAmt = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, + DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi); // Different from unsigned conversion, the shift should be one bit less to // preserve the sign bit. ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 433c29319264..18fed2ebe6e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -51,6 +51,7 @@ protected: /// Split a vector store into multiple scalar stores. /// \returns The resulting chain. 
+ SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 8dc5d45aa73b..1b9a8869d18b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -320,7 +320,6 @@ def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; // ctlz with -1 if input is zero. def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>; -def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>; // cttz with -1 if input is zero. def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>; @@ -494,9 +493,6 @@ def AMDGPUdiv_fixup : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_div_fixup node:$src0, node:$src1, node:$src2), (AMDGPUdiv_fixup_impl node:$src0, node:$src1, node:$src2)]>; -def AMDGPUffbh_i32 : PatFrags<(ops node:$src), - [(int_amdgcn_sffbh node:$src), - (AMDGPUffbh_i32_impl node:$src)]>; def AMDGPUffbh_u32 : PatFrags<(ops node:$src), [(ctlz_zero_undef node:$src), diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9f0815d39a06..8b1114f59cd6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1387,6 +1387,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); + getActionDefinitionsBuilder(G_CTLS) + .customFor({{S32, S32}}) + .scalarize(0) + .clampScalar(0, S32, S32) + .clampScalar(1, S32, S32); + // S64 is only legal on SALU, and needs to be broken into 32-bit elements in // RegBankSelect. 
getActionDefinitionsBuilder(G_BITREVERSE) @@ -2310,6 +2316,8 @@ bool AMDGPULegalizerInfo::legalizeCustom( case TargetOpcode::G_CTLZ: case TargetOpcode::G_CTTZ: return legalizeCTLZ_CTTZ(MI, MRI, B); + case TargetOpcode::G_CTLS: + return legalizeCTLS(MI, MRI, B); case TargetOpcode::G_CTLZ_ZERO_UNDEF: return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B); case TargetOpcode::G_STACKSAVE: @@ -4682,6 +4690,23 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeCTLS(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(Src); + const LLT S32 = LLT::scalar(32); + assert(SrcTy == S32 && "legalizeCTLS only supports s32"); + unsigned BitWidth = SrcTy.getSizeInBits(); + + auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src); + auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth)); + B.buildSub(Dst, Clamped, B.buildConstant(S32, 1)); + MI.eraseFromParent(); + return true; +} + // Check that this is a G_XOR x, -1 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { if (MI.getOpcode() != TargetOpcode::G_XOR) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index d3ec307b0cde..36ce5d974076 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -119,6 +119,8 @@ public: MachineIRBuilder &B) const; bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 79b6c239a6d6..39d1e762ac08 100644 --- 
a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -518,6 +518,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction(ISD::CTLS, MVT::i32, Custom); // We only really have 32-bit BFE instructions (and 16-bit on VI). // @@ -10648,8 +10649,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); - case Intrinsic::amdgcn_sffbh: - return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_sbfe: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -15760,6 +15759,21 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, } } + // umin(sffbh(x), bitwidth) -> sffbh(x) if x is known to be not 0 or -1. 
+ SDValue FfbhSrc; + uint64_t Clamp = 0; + if (Opc == ISD::UMIN && + sd_match(Op0, + m_IntrinsicWOChain<Intrinsic::amdgcn_sffbh>(m_Value(FfbhSrc))) && + sd_match(Op1, m_ConstInt(Clamp))) { + unsigned BitWidth = FfbhSrc.getValueType().getScalarSizeInBits(); + if (Clamp >= BitWidth) { + KnownBits Known = DAG.computeKnownBits(FfbhSrc); + if (Known.isNonZero() && !Known.Zero.isZero()) + return Op0; + } + } + // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { @@ -17008,6 +17022,10 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, return SDValue(); } +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + SDValue SITargetLowering::performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -17053,6 +17071,27 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)}; return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); } + + // sub (ctlz (xor x, (sra x, 31))), 1 -> ctls x. + if (isOneConstant(RHS) && isCtlzOpc(LHS.getOpcode())) { + SDValue CtlzSrc = LHS.getOperand(0); + // Check for xor x, (sra x, 31) pattern. + if (CtlzSrc.getOpcode() == ISD::XOR) { + SDValue X = CtlzSrc.getOperand(0); + SDValue SignExt = CtlzSrc.getOperand(1); + // Try both orderings of XOR operands. 
+ if (SignExt.getOpcode() != ISD::SRA) + std::swap(X, SignExt); + if (SignExt.getOpcode() == ISD::SRA && SignExt.getOperand(0) == X) { + ConstantSDNode *ShiftAmt = + dyn_cast(SignExt.getOperand(1)); + unsigned BitWidth = X.getValueType().getScalarSizeInBits(); + if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1) + return DAG.getNode(ISD::CTLS, SL, VT, X); + } + } + } + return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index f5f6355ff666..b5f484645728 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -293,7 +293,7 @@ def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64", [(set i32:$sdst, (UniformUnaryFrag i64:$src0))] >; def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", - [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag i32:$src0))] >; def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index d40378a5ac4b..86e7675626ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -370,7 +370,7 @@ defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, DivergentUnaryFrag>; defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>; -defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; +defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, int_amdgcn_sffbh>; let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in { defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctls.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctls.mir new file mode 100644 
index 000000000000..aa72fb58c144 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctls.mir @@ -0,0 +1,159 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: ctls_s32 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: ctls_s32 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]] + ; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CTLS %0 + $vgpr0 = COPY %1 +... + +--- +name: ctls_s16 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: ctls_s16 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SUB]], [[C2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s16) = G_TRUNC %0 + %2:_(s32) = G_CTLS %1 + $vgpr0 = COPY %2 +... 
+ +--- +name: ctls_v2s32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: ctls_v2s32 + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY1]](s32) + ; CHECK-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[INT1]], [[C]] + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMIN1]], [[C1]] + ; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[SUB1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1 + %3:_(<2 x s32>) = G_CTLS %2 + %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3 + $vgpr0 = COPY %4 + $vgpr1 = COPY %5 +... 
+ +--- +name: ctls_v2s16 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: ctls_v2s16 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 16 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C2]] + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SUB]], [[C]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16 + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[SEXT_INREG1]](s32) + ; CHECK-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[INT1]], [[C1]] + ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UMIN1]], [[C2]] + ; CHECK-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SUB2]], [[C]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB3]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[COPY2]](s32) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s32>) = G_CTLS %0 + %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %1 + $vgpr0 = COPY %2 + $vgpr1 = COPY %3 +... 
+ +--- +name: ctls_v4s32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: ctls_v4s32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[INT]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C1]] + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY1]](s32) + ; CHECK-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[INT1]], [[C]] + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMIN1]], [[C1]] + ; CHECK-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY2]](s32) + ; CHECK-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[INT2]], [[C]] + ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UMIN2]], [[C1]] + ; CHECK-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[COPY3]](s32) + ; CHECK-NEXT: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[INT3]], [[C]] + ; CHECK-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMIN3]], [[C1]] + ; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[SUB1]](s32) + ; CHECK-NEXT: $vgpr2 = COPY [[SUB2]](s32) + ; CHECK-NEXT: $vgpr3 = COPY [[SUB3]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3 + %5:_(<4 x s32>) = G_CTLS %4 + %6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32) = G_UNMERGE_VALUES %5 + $vgpr0 = COPY %6 + $vgpr1 = COPY %7 + $vgpr2 = COPY %8 + $vgpr3 = COPY %9 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/ctls.ll b/llvm/test/CodeGen/AMDGPU/ctls.ll new file mode 100644 index 000000000000..3181b0032875 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ctls.ll @@ -0,0 +1,624 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s + +declare i32 @llvm.ctlz.i32(i32, i1) +declare i64 @llvm.ctlz.i64(i64, i1) +declare i32 @llvm.amdgcn.sffbh.i32(i32) + +; Test that ctls(x) is lowered to umin(ffbh_i32(x), bitwidth) - 1 +; ctls is formed by the DAG combiner from: ctlz(x ^ ashr(x, 31)) - 1 +define i32 @ctls_i32(i32 %x) { +; GFX6-LABEL: ctls_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i32 %x, 31 + %b = xor i32 %x, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 1 + ret i32 %d +} + +define i32 @ctls_i32_known_positive(i32 %x) { +; GFX6-LABEL: ctls_i32_known_positive: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_i32_known_positive: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 
0x7fffffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %pos = and i32 %x, 2147483647 + %a = ashr i32 %pos, 31 + %b = xor i32 %pos, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 1 + ret i32 %d +} + +; sub(ctlz(xor(x, sra(x, 31))), 1) -> ctls(x) +define i32 @ctls_i32_xor_commuted(i32 %x) { +; GFX6-LABEL: ctls_i32_xor_commuted: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_i32_xor_commuted: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i32 %x, 31 + %b = xor i32 %a, %x ; note: reversed order compared to ctls_i32 + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 1 + ret i32 %d +} + +define i32 @ctls_i32_zero_undef(i32 %x) { +; GFX6-LABEL: ctls_i32_zero_undef: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_i32_zero_undef: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i32 %x, 31 + %b = xor i32 %x, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 true) ; zero_undef = true + %d = sub i32 %c, 1 + ret i32 %d +} + +; umin(ffbh_i32(x), 32) -> ffbh_i32(x). +define i32 @ctls_i32_known_mixed_bits(i32 %x) { +; GFX6-LABEL: ctls_i32_known_mixed_bits: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_i32_known_mixed_bits: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + ; Force bit 31 = 0 and bit 0 = 1, so value is neither all-0s nor all-1s + %cleared = and i32 %x, 2147483647 ; clear bit 31 + %mixed = or i32 %cleared, 1 ; set bit 0 + %a = ashr i32 %mixed, 31 + %b = xor i32 %mixed, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 1 + ret i32 %d +} + +; test for i64 CTLS. 
+define i32 @ctls_i64(i64 %x) { +; GFX6-LABEL: ctls_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX6-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX6-NEXT: v_min3_u32 v0, v0, v1, 64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i64 %x, 63 + %b = xor i64 %x, %a + %c = call i64 @llvm.ctlz.i64(i64 %b, i1 false) + %d = sub i64 %c, 1 + %e = trunc i64 %d to i32 + ret i32 %e +} + +; i16 CTLS via the sub(ctlz(xor(x, sra(x, 15))), 1) pattern. 
+declare i16 @llvm.ctlz.i16(i16, i1) +define i16 @ctls_i16(i16 %x) { +; GFX6-LABEL: ctls_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, 17, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i16 v0.h, 15, v0.l +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_xor_b16 v0.l, v0.l, v0.h +; GFX11-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u16 v0.l, v0.l, -1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i16 %x, 15 + %b = xor i16 %x, %a + %c = call i16 @llvm.ctlz.i16(i16 %b, i1 false) + %d = sub i16 %c, 1 + ret i16 %d +} + +; uniform input should use scalar sffbh. 
+define amdgpu_ps i32 @ctls_i32_salu(i32 inreg %x) { +; GFX6-LABEL: ctls_i32_salu: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_flbit_i32 s0, s0 +; GFX6-NEXT: s_min_u32 s0, s0, 32 +; GFX6-NEXT: s_add_i32 s0, s0, -1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ctls_i32_salu: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_cls_i32 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s0, 32 +; GFX11-NEXT: s_add_i32 s0, s0, -1 +; GFX11-NEXT: ; return to shader part epilog + %a = ashr i32 %x, 31 + %b = xor i32 %x, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 1 + ret i32 %d +} + +define <2 x i32> @ctls_v2i32(<2 x i32> %x) { +; GFX6-LABEL: ctls_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr <2 x i32> %x, + %b = xor <2 x i32> %x, %a + %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false) + %d = sub <2 x i32> %c, + ret <2 x i32> %d +} +declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) + +define <4 x i32> @ctls_v4i32(<4 x i32> %x) { +; GFX6-LABEL: ctls_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_ffbh_i32_e32 v1, v1 +; GFX6-NEXT: v_ffbh_i32_e32 v2, v2 +; GFX6-NEXT: v_ffbh_i32_e32 v3, v3 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX6-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX6-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: v_cls_i32_e32 v1, v1 +; GFX11-NEXT: v_cls_i32_e32 v2, v2 +; GFX11-NEXT: v_cls_i32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-NEXT: v_add_nc_u32_e32 
v3, -1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr <4 x i32> %x, + %b = xor <4 x i32> %x, %a + %c = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %b, i1 false) + %d = sub <4 x i32> %c, + ret <4 x i32> %d +} + +; umin should be folded away per element. +define <2 x i32> @ctls_v2i32_known_mixed_bits(<2 x i32> %x) { +; GFX6-LABEL: ctls_v2i32_known_mixed_bits: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_v2i32_known_mixed_bits: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cleared = and <2 x i32> %x, + %mixed = or <2 x i32> %cleared, + %a = ashr <2 x i32> %mixed, + %b = xor <2 x i32> %mixed, %a + %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false) + %d = sub <2 x i32> %c, + ret <2 x i32> %d +} + +; Vector with ctlz_zero_undef. 
+define <2 x i32> @ctls_v2i32_zero_undef(<2 x i32> %x) { +; GFX6-LABEL: ctls_v2i32_zero_undef: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_v2i32_zero_undef: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr <2 x i32> %x, <i32 31, i32 31> + %b = xor <2 x i32> %x, %a + %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 true) ; i1 true: a zero input yields poison + %d = sub <2 x i32> %c, <i32 1, i32 1> + ret <2 x i32> %d +} + +; Vector commuted XOR operands. 
+define <2 x i32> @ctls_v2i32_xor_commuted(<2 x i32> %x) { +; GFX6-LABEL: ctls_v2i32_xor_commuted: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_xor_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_v2i32_xor_commuted: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr <2 x i32> %x, <i32 31, i32 31> + %b = xor <2 x i32> %a, %x ; sign mask is the first xor operand here + %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false) + %d = sub <2 x i32> %c, <i32 1, i32 1> + ret <2 x i32> %d +} + +; Vector known positive: umin should NOT be folded. 
+define <2 x i32> @ctls_v2i32_known_positive(<2 x i32> %x) { +; GFX6-LABEL: ctls_v2i32_known_positive: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: ctls_v2i32_known_positive: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %pos = and <2 x i32> %x, <i32 2147483647, i32 2147483647> ; clear the sign bit of each lane + %a = ashr <2 x i32> %pos, <i32 31, i32 31> + %b = xor <2 x i32> %pos, %a + %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %b, i1 false) + %d = sub <2 x i32> %c, <i32 1, i32 1> + ret <2 x i32> %d +} + +; @llvm.amdgcn.sffbh must still produce raw hardware result. 
+define i32 @sffbh_intrinsic(i32 %x) { +; GFX6-LABEL: sffbh_intrinsic: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: sffbh_intrinsic: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %x) + ret i32 %r +} + +; sitofp i64 to f32 uses sffbh(Hi)-1, not CTLS: the v_ffbh_i32/v_cls_i32 result only gets -1 added, with no v_min_u32 against 32. +define float @sitofp_i64_to_f32(i64 %x) { +; GFX6-LABEL: sitofp_i64_to_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX6-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 +; GFX6-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: sitofp_i64_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX11-NEXT: v_cls_i32_e32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX11-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %r = sitofp i64 %x to float + ret float %r +} + +; Negative tests: each function below changes one piece of the sub(ctlz(xor(x, ashr(x, 31))), 1) idiom (shift amount 30, sign mask taken from a different value, subtracting 2 instead of 1). +define i32 @no_ctls_wrong_shift(i32 %x) { +; GFX6-LABEL: no_ctls_wrong_shift: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: no_ctls_wrong_shift: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 30, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i32 %x, 30 ; shift is 30, not 31 + %b = xor i32 %x, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 1 + ret i32 %d +} + +define i32 @no_ctls_xor_different_value(i32 %x, i32 %y) { +; GFX6-LABEL: no_ctls_xor_different_value: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: no_ctls_xor_different_value: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i32 %y, 31 ; sign mask comes from %y, while the xor below uses %x + %b = xor i32 %x, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 1 + ret i32 %d +} + +define i32 @no_ctls_sub_2(i32 %x) { +; GFX6-LABEL: no_ctls_sub_2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ffbh_i32_e32 v0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, -2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: no_ctls_sub_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cls_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, -2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %a = ashr i32 %x, 31 + %b = xor i32 %x, %a + %c = call i32 @llvm.ctlz.i32(i32 %b, i1 false) + %d = sub i32 %c, 2 ; subtracts 2, not 1 + ret i32 %d +}