[X86][DAG] remove LowerFCanonicalize (#188127)

Remove LowerFCanonicalize. Add a fallback for cases where the scalar type has its own Custom lowering, to avoid regressions on AMDGPU and SystemZ.

Fixes #143862
This commit is contained in:
Gergo Stomfai 2026-04-01 14:34:05 +01:00 committed by GitHub
parent b46f8fa622
commit 15d48c5bbe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 59 additions and 93 deletions

View File

@ -5632,6 +5632,11 @@ public:
SDNodeFlags Flags, const SDLoc &DL,
SelectionDAG &DAG) const;
/// Expand FCANONICALIZE to a multiplication by 1.0 (emitted as STRICT_FMUL).
/// \param Node Node to expand
/// \returns The expansion result
SDValue expandFCANONICALIZE(SDNode *Node, SelectionDAG &DAG) const;
/// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// \param N Node to expand

View File

@ -3783,26 +3783,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
}
case ISD::FCANONICALIZE: {
// This implements llvm.canonicalize.f* by multiplication with 1.0, as
// suggested in
// https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
// It uses strict_fp operations even outside a strict_fp context in order
// to guarantee that the canonicalization is not optimized away by later
// passes. The result chain introduced by that is intentionally ignored
// since no ordering requirement is intended here.
// Create strict multiplication by 1.0.
SDValue Operand = Node->getOperand(0);
EVT VT = Operand.getValueType();
SDValue One = DAG.getConstantFP(1.0, dl, VT);
SDValue Chain = DAG.getEntryNode();
// Propagate existing flags on canonicalize, and additionally set
// NoFPExcept.
SDNodeFlags CanonicalizeFlags = Node->getFlags();
CanonicalizeFlags.setNoFPExcept(true);
SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
{Chain, Operand, One}, CanonicalizeFlags);
SDValue Mul = TLI.expandFCANONICALIZE(Node, DAG);
Results.push_back(Mul);
break;
}

View File

@ -1076,6 +1076,20 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
case ISD::FCANONICALIZE: {
// Only expand the vector as a whole when FCANONICALIZE on the scalar
// element type is itself marked Expand. For any other action on the
// element type (e.g. Legal or Custom), leave the vector node untouched and
// break out of the switch.
// NOTE(review): presumably the code following the switch then handles the
// node per element — that code is not visible in this hunk; confirm.
EVT VT = Node->getValueType(0);
EVT EltVT = VT.getVectorElementType();
if (TLI.getOperationAction(ISD::FCANONICALIZE, EltVT.getSimpleVT()) !=
TargetLowering::Expand)
break;
// Otherwise canonicalize the whole vector in one go using the shared
// TargetLowering helper, and record its result as the expansion.
SDValue Mul = TLI.expandFCANONICALIZE(Node, DAG);
Results.push_back(Mul);
return;
}
case ISD::FSUB:
ExpandFSUB(Node, Results);
return;

View File

@ -8866,6 +8866,26 @@ void TargetLowering::expandShiftParts(SDNode *Node, SDValue &Lo, SDValue &Hi,
}
}
/// Expand an FCANONICALIZE node into a strict multiplication by 1.0.
///
/// \param Node The FCANONICALIZE node; operand 0 is the value to canonicalize.
/// \param DAG  The SelectionDAG in which the replacement nodes are created.
/// \returns The value produced by the new STRICT_FMUL node.
SDValue TargetLowering::expandFCANONICALIZE(SDNode *Node,
                                            SelectionDAG &DAG) const {
  // llvm.canonicalize.f* is implemented as `x * 1.0`, following the
  // suggestion in
  // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
  // A strict_fp multiply is emitted even outside a strict_fp context so that
  // later passes cannot optimize the canonicalization away. The chain result
  // that comes with the strict node is deliberately dropped, because no
  // ordering requirement is intended.
  SDLoc dl(Node);
  EVT ResultVT = Node->getValueType(0);

  // Carry over the flags of the canonicalize node and additionally mark the
  // multiply as not raising FP exceptions.
  SDNodeFlags MulFlags = Node->getFlags();
  MulFlags.setNoFPExcept(true);

  // Operands of the strict multiply: chain (entry node), input value, 1.0.
  SDValue MulOps[] = {DAG.getEntryNode(), Node->getOperand(0),
                      DAG.getConstantFP(1.0, dl, ResultVT)};
  return DAG.getNode(ISD::STRICT_FMUL, dl, {ResultVT, MVT::Other}, MulOps,
                     MulFlags);
}
bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;

View File

@ -315,8 +315,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
}
setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
@ -346,8 +344,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.hasSSE2()) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
@ -716,7 +712,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
@ -879,7 +874,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand);
if (isTypeLegal(MVT::f16)) {
setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
@ -942,7 +937,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (isTypeLegal(MVT::f80)) {
setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand);
}
setOperationAction(ISD::SETCC, MVT::f128, Custom);
@ -1078,11 +1073,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Expand);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Expand);
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
@ -1137,7 +1132,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Expand);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
@ -1496,7 +1491,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Expand);
}
setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
@ -1783,9 +1778,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Expand);
setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Expand);
setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Expand);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
@ -1867,7 +1862,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Expand);
}
setOperationAction(ISD::LRINT, MVT::v16f32,
Subtarget.hasDQI() ? Legal : Custom);
@ -34066,24 +34061,6 @@ static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
return Op;
}
/// Lower FCANONICALIZE for X86 as a STRICT_FMUL of the operand by 1.0.
///
/// \param Op  The FCANONICALIZE value to lower; its operand 0 is the input.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns The value produced by the new STRICT_FMUL node.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op.getNode());
  SDValue Src = Op.getNode()->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: Fix Crash for bf16 when generating strict_fmul as it
  // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
  // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
  // promote this operator's result!

  // Operands of the strict multiply: chain (entry node), input value, 1.0.
  SDValue MulOps[] = {DAG.getEntryNode(), Src,
                      DAG.getConstantFP(1.0, DL, SrcVT)};
  return DAG.getNode(ISD::STRICT_FMUL, DL, {SrcVT, MVT::Other}, MulOps);
}
static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
unsigned OpNo) {
const APInt Operand(32, OpNo);
@ -34225,7 +34202,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
case ISD::STRICT_SINT_TO_FP:
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::STRICT_UINT_TO_FP:

View File

@ -205,17 +205,8 @@ define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind {
define <4 x float> @canonicalize_v4f32(<4 x float> %a) {
; Z16-LABEL: canonicalize_v4f32:
; Z16: # %bb.0:
; Z16-NEXT: vrepf %v0, %v24, 3
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: vrepf %v2, %v24, 2
; Z16-NEXT: meebr %f0, %f1
; Z16-NEXT: meebr %f2, %f1
; Z16-NEXT: vrepf %v3, %v24, 1
; Z16-NEXT: vmrhf %v0, %v2, %v0
; Z16-NEXT: wfmsb %f2, %v24, %f1
; Z16-NEXT: wfmsb %f1, %f3, %f1
; Z16-NEXT: vmrhf %v1, %v2, %v1
; Z16-NEXT: vmrhg %v24, %v1, %v0
; Z16-NEXT: vgmf %v0, 2, 8
; Z16-NEXT: vfmsb %v24, %v24, %v0
; Z16-NEXT: br %r14
%canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a)
ret <4 x float> %canonicalized
@ -225,14 +216,8 @@ define <4 x double> @canonicalize_v4f64(<4 x double> %a) {
; Z16-LABEL: canonicalize_v4f64:
; Z16: # %bb.0:
; Z16-NEXT: vgmg %v0, 2, 11
; Z16-NEXT: vrepg %v2, %v24, 1
; Z16-NEXT: wfmdb %f1, %v24, %f0
; Z16-NEXT: mdbr %f2, %f0
; Z16-NEXT: vmrhg %v24, %v1, %v2
; Z16-NEXT: vrepg %v2, %v26, 1
; Z16-NEXT: wfmdb %f1, %v26, %f0
; Z16-NEXT: wfmdb %f0, %f2, %f0
; Z16-NEXT: vmrhg %v26, %v1, %v0
; Z16-NEXT: vfmdb %v24, %v24, %v0
; Z16-NEXT: vfmdb %v26, %v26, %v0
; Z16-NEXT: br %r14
%canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %a)
ret <4 x double> %canonicalized
@ -358,17 +343,8 @@ define void @canonicalize_ptr_v4f32(ptr %out) {
; Z16-LABEL: canonicalize_ptr_v4f32:
; Z16: # %bb.0:
; Z16-NEXT: vl %v0, 0(%r2), 3
; Z16-NEXT: vrepf %v1, %v0, 3
; Z16-NEXT: vgmf %v2, 2, 8
; Z16-NEXT: vrepf %v3, %v0, 2
; Z16-NEXT: meebr %f1, %f2
; Z16-NEXT: meebr %f3, %f2
; Z16-NEXT: vmrhf %v1, %v3, %v1
; Z16-NEXT: wfmsb %f3, %f0, %f2
; Z16-NEXT: vrepf %v0, %v0, 1
; Z16-NEXT: meebr %f0, %f2
; Z16-NEXT: vmrhf %v0, %v3, %v0
; Z16-NEXT: vmrhg %v0, %v0, %v1
; Z16-NEXT: vgmf %v1, 2, 8
; Z16-NEXT: vfmsb %v0, %v0, %v1
; Z16-NEXT: vst %v0, 0(%r2), 3
; Z16-NEXT: br %r14
%val = load <4 x float>, ptr %out
@ -380,17 +356,11 @@ define void @canonicalize_ptr_v4f32(ptr %out) {
define void @canonicalize_ptr_v4f64(ptr %out) {
; Z16-LABEL: canonicalize_ptr_v4f64:
; Z16: # %bb.0:
; Z16-NEXT: vl %v0, 0(%r2), 4
; Z16-NEXT: vl %v1, 16(%r2), 4
; Z16-NEXT: vgmg %v2, 2, 11
; Z16-NEXT: wfmdb %f3, %f1, %f2
; Z16-NEXT: vrepg %v1, %v1, 1
; Z16-NEXT: mdbr %f1, %f2
; Z16-NEXT: vl %v0, 0(%r2), 4
; Z16-NEXT: vmrhg %v1, %v3, %v1
; Z16-NEXT: wfmdb %f3, %f0, %f2
; Z16-NEXT: vrepg %v0, %v0, 1
; Z16-NEXT: mdbr %f0, %f2
; Z16-NEXT: vmrhg %v0, %v3, %v0
; Z16-NEXT: vfmdb %v1, %v1, %v2
; Z16-NEXT: vfmdb %v0, %v0, %v2
; Z16-NEXT: vst %v0, 0(%r2), 4
; Z16-NEXT: vst %v1, 16(%r2), 4
; Z16-NEXT: br %r14