From 15d48c5bbe2189db1b79aca0b4f355d0c7d664b6 Mon Sep 17 00:00:00 2001 From: Gergo Stomfai Date: Wed, 1 Apr 2026 14:34:05 +0100 Subject: [PATCH] [X86][DAG] remove LowerFCanonicalize (#188127) Remove LowerFCanonicalize. Added fallback for cases when the scalar type also has its Custom lowering to avoid regressions on AMDGPU and SystemZ. Fixes #143862 --- llvm/include/llvm/CodeGen/TargetLowering.h | 5 ++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 21 +------- .../SelectionDAG/LegalizeVectorOps.cpp | 14 ++++++ .../CodeGen/SelectionDAG/TargetLowering.cpp | 20 ++++++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 44 ++++------------- .../test/CodeGen/SystemZ/canonicalize-vars.ll | 48 ++++--------------- 6 files changed, 59 insertions(+), 93 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4d7b6ea0755e..fbed0d5378db 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5632,6 +5632,11 @@ public: SDNodeFlags Flags, const SDLoc &DL, SelectionDAG &DAG) const; + /// Expand FCANONICALIZE to FMUL with 1. + /// \param Node Node to expand + /// \returns The expansion result + SDValue expandFCANONICALIZE(SDNode *Node, SelectionDAG &DAG) const; + /// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes, /// vector nodes can only succeed if all operations are legal/custom. /// \param N Node to expand diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index c9de7faf4f2e..54d86dfbfa30 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3783,26 +3783,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; } case ISD::FCANONICALIZE: { - // This implements llvm.canonicalize.f* by multiplication with 1.0, as - // suggested in - // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic. 
- // It uses strict_fp operations even outside a strict_fp context in order - // to guarantee that the canonicalization is not optimized away by later - // passes. The result chain introduced by that is intentionally ignored - // since no ordering requirement is intended here. - - // Create strict multiplication by 1.0. - SDValue Operand = Node->getOperand(0); - EVT VT = Operand.getValueType(); - SDValue One = DAG.getConstantFP(1.0, dl, VT); - SDValue Chain = DAG.getEntryNode(); - // Propagate existing flags on canonicalize, and additionally set - // NoFPExcept. - SDNodeFlags CanonicalizeFlags = Node->getFlags(); - CanonicalizeFlags.setNoFPExcept(true); - SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other}, - {Chain, Operand, One}, CanonicalizeFlags); - + SDValue Mul = TLI.expandFCANONICALIZE(Node, DAG); Results.push_back(Mul); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index b6e7c275bb3a..46e9a783324f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1076,6 +1076,20 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { return; } break; + case ISD::FCANONICALIZE: { + // If the scalar element type has a + // Legal/Custom FCANONICALIZE, don't + // mess with the vector, fall back. + EVT VT = Node->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + if (TLI.getOperationAction(ISD::FCANONICALIZE, EltVT.getSimpleVT()) != + TargetLowering::Expand) + break; + // Otherwise canonicalize the whole vector. 
+ SDValue Mul = TLI.expandFCANONICALIZE(Node, DAG); + Results.push_back(Mul); + return; + } case ISD::FSUB: ExpandFSUB(Node, Results); return; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 2b1b6c0ad636..90ac63b14741 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8866,6 +8866,26 @@ void TargetLowering::expandShiftParts(SDNode *Node, SDValue &Lo, SDValue &Hi, } } +SDValue TargetLowering::expandFCANONICALIZE(SDNode *Node, + SelectionDAG &DAG) const { + // This implements llvm.canonicalize.f* by multiplication with 1.0, as + // suggested in + // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic. + // It uses strict_fp operations even outside a strict_fp context in order + // to guarantee that the canonicalization is not optimized away by later + // passes. The result chain introduced by that is intentionally ignored + // since no ordering requirement is intended here. + EVT VT = Node->getValueType(0); + SDLoc DL(Node); + SDNodeFlags Flags = Node->getFlags(); + Flags.setNoFPExcept(true); + SDValue One = DAG.getConstantFP(1.0, DL, VT); + SDValue Mul = + DAG.getNode(ISD::STRICT_FMUL, DL, {VT, MVT::Other}, + {DAG.getEntryNode(), Node->getOperand(0), One}, Flags); + return Mul; +} + bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const { unsigned OpNo = Node->isStrictFPOpcode() ? 
1 : 0; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 32796c3e5678..e1a7876e30de 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -315,8 +315,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } - setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); @@ -346,8 +344,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.hasSSE2()) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); - setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. 
@@ -716,7 +712,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); @@ -879,7 +874,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); - setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand); if (isTypeLegal(MVT::f16)) { setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); @@ -942,7 +937,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (isTypeLegal(MVT::f80)) { setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand); } setOperationAction(ISD::SETCC, MVT::f128, Custom); @@ -1078,11 +1073,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Expand); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Expand); 
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); @@ -1137,7 +1132,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, MVT::v2i32, Custom); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Expand); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); @@ -1496,7 +1491,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); - setOperationAction(ISD::FCANONICALIZE, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Expand); } setOperationAction(ISD::LRINT, MVT::v8f32, Custom); @@ -1783,9 +1778,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Expand); + setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Expand); + setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Expand); // There is no byte sized k-register load or store without AVX512DQ. 
if (!Subtarget.hasDQI()) { @@ -1867,7 +1862,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); - setOperationAction(ISD::FCANONICALIZE, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Expand); } setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); @@ -34066,24 +34061,6 @@ static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget, return Op; } -static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) { - SDNode *N = Op.getNode(); - SDValue Operand = N->getOperand(0); - EVT VT = Operand.getValueType(); - SDLoc dl(N); - - SDValue One = DAG.getConstantFP(1.0, dl, VT); - - // TODO: Fix Crash for bf16 when generating strict_fmul as it - // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0, - // ConstantFP:bf16, t5 LLVM ERROR: Do not know how to soft - // promote this operator's result! 
- SDValue Chain = DAG.getEntryNode(); - SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other}, - {Chain, Operand, One}); - return StrictFmul; -} - static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) { const APInt Operand(32, OpNo); @@ -34225,7 +34202,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); - case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG); case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::STRICT_UINT_TO_FP: diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll index d0f3414e8949..e6659d385ae5 100644 --- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll +++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll @@ -205,17 +205,8 @@ define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind { define <4 x float> @canonicalize_v4f32(<4 x float> %a) { ; Z16-LABEL: canonicalize_v4f32: ; Z16: # %bb.0: -; Z16-NEXT: vrepf %v0, %v24, 3 -; Z16-NEXT: vgmf %v1, 2, 8 -; Z16-NEXT: vrepf %v2, %v24, 2 -; Z16-NEXT: meebr %f0, %f1 -; Z16-NEXT: meebr %f2, %f1 -; Z16-NEXT: vrepf %v3, %v24, 1 -; Z16-NEXT: vmrhf %v0, %v2, %v0 -; Z16-NEXT: wfmsb %f2, %v24, %f1 -; Z16-NEXT: wfmsb %f1, %f3, %f1 -; Z16-NEXT: vmrhf %v1, %v2, %v1 -; Z16-NEXT: vmrhg %v24, %v1, %v0 +; Z16-NEXT: vgmf %v0, 2, 8 +; Z16-NEXT: vfmsb %v24, %v24, %v0 ; Z16-NEXT: br %r14 %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a) ret <4 x float> %canonicalized @@ -225,14 +216,8 @@ define <4 x double> @canonicalize_v4f64(<4 x double> %a) { ; Z16-LABEL: canonicalize_v4f64: ; Z16: # %bb.0: ; Z16-NEXT: vgmg %v0, 2, 11 -; Z16-NEXT: vrepg %v2, %v24, 1 -; Z16-NEXT: wfmdb %f1, %v24, %f0 -; Z16-NEXT: mdbr %f2, %f0 -; Z16-NEXT: vmrhg %v24, %v1, %v2 -; Z16-NEXT: vrepg %v2, %v26, 1 -; 
Z16-NEXT: wfmdb %f1, %v26, %f0 -; Z16-NEXT: wfmdb %f0, %f2, %f0 -; Z16-NEXT: vmrhg %v26, %v1, %v0 +; Z16-NEXT: vfmdb %v24, %v24, %v0 +; Z16-NEXT: vfmdb %v26, %v26, %v0 ; Z16-NEXT: br %r14 %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %a) ret <4 x double> %canonicalized @@ -358,17 +343,8 @@ define void @canonicalize_ptr_v4f32(ptr %out) { ; Z16-LABEL: canonicalize_ptr_v4f32: ; Z16: # %bb.0: ; Z16-NEXT: vl %v0, 0(%r2), 3 -; Z16-NEXT: vrepf %v1, %v0, 3 -; Z16-NEXT: vgmf %v2, 2, 8 -; Z16-NEXT: vrepf %v3, %v0, 2 -; Z16-NEXT: meebr %f1, %f2 -; Z16-NEXT: meebr %f3, %f2 -; Z16-NEXT: vmrhf %v1, %v3, %v1 -; Z16-NEXT: wfmsb %f3, %f0, %f2 -; Z16-NEXT: vrepf %v0, %v0, 1 -; Z16-NEXT: meebr %f0, %f2 -; Z16-NEXT: vmrhf %v0, %v3, %v0 -; Z16-NEXT: vmrhg %v0, %v0, %v1 +; Z16-NEXT: vgmf %v1, 2, 8 +; Z16-NEXT: vfmsb %v0, %v0, %v1 ; Z16-NEXT: vst %v0, 0(%r2), 3 ; Z16-NEXT: br %r14 %val = load <4 x float>, ptr %out @@ -380,17 +356,11 @@ define void @canonicalize_ptr_v4f32(ptr %out) { define void @canonicalize_ptr_v4f64(ptr %out) { ; Z16-LABEL: canonicalize_ptr_v4f64: ; Z16: # %bb.0: +; Z16-NEXT: vl %v0, 0(%r2), 4 ; Z16-NEXT: vl %v1, 16(%r2), 4 ; Z16-NEXT: vgmg %v2, 2, 11 -; Z16-NEXT: wfmdb %f3, %f1, %f2 -; Z16-NEXT: vrepg %v1, %v1, 1 -; Z16-NEXT: mdbr %f1, %f2 -; Z16-NEXT: vl %v0, 0(%r2), 4 -; Z16-NEXT: vmrhg %v1, %v3, %v1 -; Z16-NEXT: wfmdb %f3, %f0, %f2 -; Z16-NEXT: vrepg %v0, %v0, 1 -; Z16-NEXT: mdbr %f0, %f2 -; Z16-NEXT: vmrhg %v0, %v3, %v0 +; Z16-NEXT: vfmdb %v1, %v1, %v2 +; Z16-NEXT: vfmdb %v0, %v0, %v2 ; Z16-NEXT: vst %v0, 0(%r2), 4 ; Z16-NEXT: vst %v1, 16(%r2), 4 ; Z16-NEXT: br %r14