fix llvm.fma.f16 double rounding issue when there is no native support (#171904)

fixes https://github.com/llvm/llvm-project/issues/98389

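As the issue describes, promoting `llvm.fma.f16` to `llvm.fma.f32` does
not work: the result is rounded once to `f32` and then rounded again when
it is truncated back to `f16`, and `f32` does not carry enough precision
for this double rounding to always produce the correctly rounded `f16`
result. `f64` does have sufficient precision, so this PR explicitly
promotes the 16-bit fma to a 64-bit fma.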

I could not find examples of a libcall being used for fma, but that's
something that could be looked into separately to work around code size
issues.
This commit is contained in:
Folkert de Vries 2025-12-17 22:03:01 +01:00 committed by GitHub
parent 558760009c
commit a587ccd87d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 916 additions and 656 deletions

View File

@ -3510,6 +3510,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
SDNodeFlags Flags = N->getFlags();
SDLoc dl(N);
// Promote to the larger FP type.
@ -3518,9 +3519,28 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);
SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
SDValue Res;
if (OVT == MVT::f16) {
// If f16 fma is not natively supported, the value must be promoted to an
// f64 (and not to f32!) to prevent double rounding issues.
SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0, Flags);
SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1, Flags);
SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2, Flags);
// Convert back to FP16 as an integer.
// Prefer a wide FMA node if available; otherwise expand to mul+add.
SDValue WideRes;
if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f64)) {
WideRes = DAG.getNode(ISD::FMA, dl, MVT::f64, A64, B64, C64, Flags);
} else {
SDValue Mul = DAG.getNode(ISD::FMUL, dl, MVT::f64, A64, B64, Flags);
WideRes = DAG.getNode(ISD::FADD, dl, MVT::f64, Mul, C64, Flags);
}
return DAG.getNode(GetPromotionOpcode(MVT::f64, OVT), dl, MVT::i16,
WideRes);
}
Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2, Flags);
return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}

View File

@ -1041,6 +1041,11 @@ void TargetLoweringBase::initActions() {
}
}
// If f16 fma is not natively supported, the value must be promoted to an f64
// (and not to f32!) to prevent double rounding issues.
AddPromotedToType(ISD::FMA, MVT::f16, MVT::f64);
AddPromotedToType(ISD::STRICT_FMA, MVT::f16, MVT::f64);
// Set default actions for various operations.
for (MVT VT : MVT::all_valuetypes()) {
// Default all indexed load / store to expand.

View File

@ -570,6 +570,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, XLenVT, Custom);
}
if (!Subtarget.hasStdExtD()) {
// FIXME: handle f16 fma when f64 is not legal. Using an f32 fma
// instruction runs into double rounding issues, so this is wrong.
// Normally we'd use an f64 fma, but without the D extension the f64 type
// is not legal. This should probably be a libcall.
AddPromotedToType(ISD::FMA, MVT::f16, MVT::f32);
AddPromotedToType(ISD::STRICT_FMA, MVT::f16, MVT::f32);
}
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal);

View File

@ -1378,11 +1378,11 @@ define half @test_log2(half %a) #0 {
define half @test_fma(half %a, half %b, half %c) #0 {
; CHECK-CVT-SD-LABEL: test_fma:
; CHECK-CVT-SD: // %bb.0:
; CHECK-CVT-SD-NEXT: fcvt s2, h2
; CHECK-CVT-SD-NEXT: fcvt s1, h1
; CHECK-CVT-SD-NEXT: fcvt s0, h0
; CHECK-CVT-SD-NEXT: fmadd s0, s0, s1, s2
; CHECK-CVT-SD-NEXT: fcvt h0, s0
; CHECK-CVT-SD-NEXT: fcvt d2, h2
; CHECK-CVT-SD-NEXT: fcvt d1, h1
; CHECK-CVT-SD-NEXT: fcvt d0, h0
; CHECK-CVT-SD-NEXT: fmadd d0, d0, d1, d2
; CHECK-CVT-SD-NEXT: fcvt h0, d0
; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fma:

View File

@ -27,11 +27,11 @@ entry:
define half @fma_f16(half %a, half %b, half %c) {
; CHECK-SD-NOFP16-LABEL: fma_f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2
; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fma_f16:
@ -178,69 +178,69 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h2
; CHECK-SD-NOFP16-NEXT: fcvt s7, h1
; CHECK-SD-NOFP16-NEXT: fcvt s16, h0
; CHECK-SD-NOFP16-NEXT: fcvt d6, h2
; CHECK-SD-NOFP16-NEXT: fcvt d7, h1
; CHECK-SD-NOFP16-NEXT: fcvt d16, h0
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h17
; CHECK-SD-NOFP16-NEXT: fcvt s16, h18
; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6
; CHECK-SD-NOFP16-NEXT: fcvt d7, h17
; CHECK-SD-NOFP16-NEXT: fcvt d16, h18
; CHECK-SD-NOFP16-NEXT: fcvt d17, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[3]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fmadd s4, s5, s4, s3
; CHECK-SD-NOFP16-NEXT: fmadd d4, d5, d4, d3
; CHECK-SD-NOFP16-NEXT: mov h5, v2.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
; CHECK-SD-NOFP16-NEXT: fmadd s6, s17, s16, s7
; CHECK-SD-NOFP16-NEXT: fcvt h3, d6
; CHECK-SD-NOFP16-NEXT: fmadd d6, d17, d16, d7
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt s7, h18
; CHECK-SD-NOFP16-NEXT: fcvt s16, h19
; CHECK-SD-NOFP16-NEXT: fcvt d7, h18
; CHECK-SD-NOFP16-NEXT: fcvt d16, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
; CHECK-SD-NOFP16-NEXT: fcvt h6, d6
; CHECK-SD-NOFP16-NEXT: fcvt d17, h17
; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[5]
; CHECK-SD-NOFP16-NEXT: fmadd s5, s16, s7, s5
; CHECK-SD-NOFP16-NEXT: fmadd d5, d16, d7, d5
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
; CHECK-SD-NOFP16-NEXT: fcvt d19, h19
; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0]
; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17
; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
; CHECK-SD-NOFP16-NEXT: fcvt d7, h7
; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
; CHECK-SD-NOFP16-NEXT: fcvt h5, d5
; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fmadd s4, s16, s7, s4
; CHECK-SD-NOFP16-NEXT: fmadd d4, d16, d7, d4
; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
; CHECK-SD-NOFP16-NEXT: fcvt s6, h18
; CHECK-SD-NOFP16-NEXT: fcvt s7, h19
; CHECK-SD-NOFP16-NEXT: fcvt h16, s17
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fmadd s5, s7, s6, s5
; CHECK-SD-NOFP16-NEXT: fcvt d5, h6
; CHECK-SD-NOFP16-NEXT: fcvt d6, h18
; CHECK-SD-NOFP16-NEXT: fcvt d7, h19
; CHECK-SD-NOFP16-NEXT: fcvt h16, d17
; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
; CHECK-SD-NOFP16-NEXT: fmadd d5, d7, d6, d5
; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v16.h[0]
; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2
; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2
; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h4, d5
; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v3.16b
@ -301,34 +301,34 @@ define <4 x half> @fma_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h2
; CHECK-SD-NOFP16-NEXT: fcvt s7, h1
; CHECK-SD-NOFP16-NEXT: fcvt s16, h0
; CHECK-SD-NOFP16-NEXT: fcvt d6, h2
; CHECK-SD-NOFP16-NEXT: fcvt d7, h1
; CHECK-SD-NOFP16-NEXT: fcvt d16, h0
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[3]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6
; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s7, h19
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fmadd s3, s5, s4, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h17
; CHECK-SD-NOFP16-NEXT: fcvt s5, h18
; CHECK-SD-NOFP16-NEXT: fcvt h0, s6
; CHECK-SD-NOFP16-NEXT: fmadd s4, s7, s5, s4
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h16
; CHECK-SD-NOFP16-NEXT: fcvt d7, h19
; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
; CHECK-SD-NOFP16-NEXT: fmadd d3, d5, d4, d3
; CHECK-SD-NOFP16-NEXT: fcvt d4, h17
; CHECK-SD-NOFP16-NEXT: fcvt d5, h18
; CHECK-SD-NOFP16-NEXT: fcvt h0, d6
; CHECK-SD-NOFP16-NEXT: fmadd d4, d7, d5, d4
; CHECK-SD-NOFP16-NEXT: fcvt h3, d3
; CHECK-SD-NOFP16-NEXT: fcvt d5, h16
; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s4
; CHECK-SD-NOFP16-NEXT: fmadd s1, s5, s1, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, d4
; CHECK-SD-NOFP16-NEXT: fmadd d1, d5, d1, d2
; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
; CHECK-SD-NOFP16-NEXT: fcvt h1, d1
; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v1.h[0]
; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NOFP16-NEXT: ret
@ -364,69 +364,69 @@ define <8 x half> @fma_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h2
; CHECK-SD-NOFP16-NEXT: fcvt s7, h1
; CHECK-SD-NOFP16-NEXT: fcvt s16, h0
; CHECK-SD-NOFP16-NEXT: fcvt d6, h2
; CHECK-SD-NOFP16-NEXT: fcvt d7, h1
; CHECK-SD-NOFP16-NEXT: fcvt d16, h0
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h17
; CHECK-SD-NOFP16-NEXT: fcvt s16, h18
; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6
; CHECK-SD-NOFP16-NEXT: fcvt d7, h17
; CHECK-SD-NOFP16-NEXT: fcvt d16, h18
; CHECK-SD-NOFP16-NEXT: fcvt d17, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[3]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fmadd s4, s5, s4, s3
; CHECK-SD-NOFP16-NEXT: fmadd d4, d5, d4, d3
; CHECK-SD-NOFP16-NEXT: mov h5, v2.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
; CHECK-SD-NOFP16-NEXT: fmadd s6, s17, s16, s7
; CHECK-SD-NOFP16-NEXT: fcvt h3, d6
; CHECK-SD-NOFP16-NEXT: fmadd d6, d17, d16, d7
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt s7, h18
; CHECK-SD-NOFP16-NEXT: fcvt s16, h19
; CHECK-SD-NOFP16-NEXT: fcvt d7, h18
; CHECK-SD-NOFP16-NEXT: fcvt d16, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
; CHECK-SD-NOFP16-NEXT: fcvt h6, d6
; CHECK-SD-NOFP16-NEXT: fcvt d17, h17
; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[5]
; CHECK-SD-NOFP16-NEXT: fmadd s5, s16, s7, s5
; CHECK-SD-NOFP16-NEXT: fmadd d5, d16, d7, d5
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
; CHECK-SD-NOFP16-NEXT: fcvt d19, h19
; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0]
; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17
; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
; CHECK-SD-NOFP16-NEXT: fcvt d7, h7
; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
; CHECK-SD-NOFP16-NEXT: fcvt h5, d5
; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fmadd s4, s16, s7, s4
; CHECK-SD-NOFP16-NEXT: fmadd d4, d16, d7, d4
; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
; CHECK-SD-NOFP16-NEXT: fcvt s6, h18
; CHECK-SD-NOFP16-NEXT: fcvt s7, h19
; CHECK-SD-NOFP16-NEXT: fcvt h16, s17
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fmadd s5, s7, s6, s5
; CHECK-SD-NOFP16-NEXT: fcvt d5, h6
; CHECK-SD-NOFP16-NEXT: fcvt d6, h18
; CHECK-SD-NOFP16-NEXT: fcvt d7, h19
; CHECK-SD-NOFP16-NEXT: fcvt h16, d17
; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
; CHECK-SD-NOFP16-NEXT: fmadd d5, d7, d6, d5
; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v16.h[0]
; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2
; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2
; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h4, d5
; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v3.16b
@ -468,136 +468,136 @@ define <16 x half> @fma_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h6, v4.h[1]
; CHECK-SD-NOFP16-NEXT: mov h7, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s17, h4
; CHECK-SD-NOFP16-NEXT: fcvt s18, h2
; CHECK-SD-NOFP16-NEXT: fcvt s19, h0
; CHECK-SD-NOFP16-NEXT: fcvt d17, h4
; CHECK-SD-NOFP16-NEXT: fcvt d18, h2
; CHECK-SD-NOFP16-NEXT: fcvt d19, h0
; CHECK-SD-NOFP16-NEXT: mov h20, v4.h[2]
; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h23, v4.h[3]
; CHECK-SD-NOFP16-NEXT: mov h24, v2.h[3]
; CHECK-SD-NOFP16-NEXT: mov h25, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17
; CHECK-SD-NOFP16-NEXT: fcvt d6, h6
; CHECK-SD-NOFP16-NEXT: fcvt d7, h7
; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17
; CHECK-SD-NOFP16-NEXT: mov h26, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s27, h5
; CHECK-SD-NOFP16-NEXT: fcvt s18, h20
; CHECK-SD-NOFP16-NEXT: fcvt s19, h21
; CHECK-SD-NOFP16-NEXT: fcvt s20, h22
; CHECK-SD-NOFP16-NEXT: fcvt s21, h23
; CHECK-SD-NOFP16-NEXT: fcvt s22, h24
; CHECK-SD-NOFP16-NEXT: fcvt s23, h25
; CHECK-SD-NOFP16-NEXT: fmadd s7, s16, s7, s6
; CHECK-SD-NOFP16-NEXT: fcvt d27, h5
; CHECK-SD-NOFP16-NEXT: fcvt d18, h20
; CHECK-SD-NOFP16-NEXT: fcvt d19, h21
; CHECK-SD-NOFP16-NEXT: fcvt d20, h22
; CHECK-SD-NOFP16-NEXT: fcvt d21, h23
; CHECK-SD-NOFP16-NEXT: fcvt d22, h24
; CHECK-SD-NOFP16-NEXT: fcvt d23, h25
; CHECK-SD-NOFP16-NEXT: fmadd d7, d16, d7, d6
; CHECK-SD-NOFP16-NEXT: mov h24, v5.h[1]
; CHECK-SD-NOFP16-NEXT: mov h25, v3.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt h6, s17
; CHECK-SD-NOFP16-NEXT: fcvt s28, h3
; CHECK-SD-NOFP16-NEXT: fcvt s29, h1
; CHECK-SD-NOFP16-NEXT: fmadd s19, s20, s19, s18
; CHECK-SD-NOFP16-NEXT: fcvt s26, h26
; CHECK-SD-NOFP16-NEXT: fcvt h6, d17
; CHECK-SD-NOFP16-NEXT: fcvt d28, h3
; CHECK-SD-NOFP16-NEXT: fcvt d29, h1
; CHECK-SD-NOFP16-NEXT: fmadd d19, d20, d19, d18
; CHECK-SD-NOFP16-NEXT: fcvt d26, h26
; CHECK-SD-NOFP16-NEXT: mov h16, v4.h[4]
; CHECK-SD-NOFP16-NEXT: fmadd s21, s23, s22, s21
; CHECK-SD-NOFP16-NEXT: fmadd d21, d23, d22, d21
; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[2]
; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h20, s7
; CHECK-SD-NOFP16-NEXT: fcvt s24, h24
; CHECK-SD-NOFP16-NEXT: fcvt s25, h25
; CHECK-SD-NOFP16-NEXT: fcvt h20, d7
; CHECK-SD-NOFP16-NEXT: fcvt d24, h24
; CHECK-SD-NOFP16-NEXT: fcvt d25, h25
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4]
; CHECK-SD-NOFP16-NEXT: mov h18, v0.h[4]
; CHECK-SD-NOFP16-NEXT: mov h7, v4.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h19, s19
; CHECK-SD-NOFP16-NEXT: fcvt h19, d19
; CHECK-SD-NOFP16-NEXT: mov h30, v2.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt h21, s21
; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
; CHECK-SD-NOFP16-NEXT: fcvt h21, d21
; CHECK-SD-NOFP16-NEXT: mov h31, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fmadd s24, s26, s25, s24
; CHECK-SD-NOFP16-NEXT: fmadd s25, s29, s28, s27
; CHECK-SD-NOFP16-NEXT: fmadd d24, d26, d25, d24
; CHECK-SD-NOFP16-NEXT: fmadd d25, d29, d28, d27
; CHECK-SD-NOFP16-NEXT: mov v6.h[1], v20.h[0]
; CHECK-SD-NOFP16-NEXT: mov h20, v5.h[2]
; CHECK-SD-NOFP16-NEXT: mov h26, v5.h[3]
; CHECK-SD-NOFP16-NEXT: mov h27, v3.h[3]
; CHECK-SD-NOFP16-NEXT: mov h28, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
; CHECK-SD-NOFP16-NEXT: fcvt s29, h7
; CHECK-SD-NOFP16-NEXT: fcvt s30, h30
; CHECK-SD-NOFP16-NEXT: fcvt d17, h17
; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
; CHECK-SD-NOFP16-NEXT: fcvt d29, h7
; CHECK-SD-NOFP16-NEXT: fcvt d30, h30
; CHECK-SD-NOFP16-NEXT: mov v6.h[2], v19.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h24, s24
; CHECK-SD-NOFP16-NEXT: fcvt h7, s25
; CHECK-SD-NOFP16-NEXT: fcvt s19, h20
; CHECK-SD-NOFP16-NEXT: fcvt s20, h22
; CHECK-SD-NOFP16-NEXT: fcvt s22, h23
; CHECK-SD-NOFP16-NEXT: fmadd s16, s18, s17, s16
; CHECK-SD-NOFP16-NEXT: fcvt h24, d24
; CHECK-SD-NOFP16-NEXT: fcvt h7, d25
; CHECK-SD-NOFP16-NEXT: fcvt d19, h20
; CHECK-SD-NOFP16-NEXT: fcvt d20, h22
; CHECK-SD-NOFP16-NEXT: fcvt d22, h23
; CHECK-SD-NOFP16-NEXT: fmadd d16, d18, d17, d16
; CHECK-SD-NOFP16-NEXT: mov h23, v0.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt s25, h26
; CHECK-SD-NOFP16-NEXT: fcvt s26, h27
; CHECK-SD-NOFP16-NEXT: fcvt s27, h28
; CHECK-SD-NOFP16-NEXT: fcvt d25, h26
; CHECK-SD-NOFP16-NEXT: fcvt d26, h27
; CHECK-SD-NOFP16-NEXT: fcvt d27, h28
; CHECK-SD-NOFP16-NEXT: mov h18, v4.h[6]
; CHECK-SD-NOFP16-NEXT: mov v6.h[3], v21.h[0]
; CHECK-SD-NOFP16-NEXT: mov v7.h[1], v24.h[0]
; CHECK-SD-NOFP16-NEXT: mov h24, v5.h[5]
; CHECK-SD-NOFP16-NEXT: fmadd s19, s22, s20, s19
; CHECK-SD-NOFP16-NEXT: fmadd d19, d22, d20, d19
; CHECK-SD-NOFP16-NEXT: mov h20, v5.h[4]
; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt s23, h23
; CHECK-SD-NOFP16-NEXT: fcvt d23, h23
; CHECK-SD-NOFP16-NEXT: mov h28, v0.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt h16, s16
; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
; CHECK-SD-NOFP16-NEXT: fcvt h16, d16
; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
; CHECK-SD-NOFP16-NEXT: mov h4, v4.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s20, h20
; CHECK-SD-NOFP16-NEXT: fcvt s21, h22
; CHECK-SD-NOFP16-NEXT: fcvt s22, h31
; CHECK-SD-NOFP16-NEXT: fmadd s17, s23, s30, s29
; CHECK-SD-NOFP16-NEXT: fmadd s23, s27, s26, s25
; CHECK-SD-NOFP16-NEXT: fcvt h19, s19
; CHECK-SD-NOFP16-NEXT: fcvt d20, h20
; CHECK-SD-NOFP16-NEXT: fcvt d21, h22
; CHECK-SD-NOFP16-NEXT: fcvt d22, h31
; CHECK-SD-NOFP16-NEXT: fmadd d17, d23, d30, d29
; CHECK-SD-NOFP16-NEXT: fmadd d23, d27, d26, d25
; CHECK-SD-NOFP16-NEXT: fcvt h19, d19
; CHECK-SD-NOFP16-NEXT: mov h25, v3.h[5]
; CHECK-SD-NOFP16-NEXT: mov h26, v1.h[5]
; CHECK-SD-NOFP16-NEXT: mov h27, v2.h[6]
; CHECK-SD-NOFP16-NEXT: mov h29, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fmadd s20, s22, s21, s20
; CHECK-SD-NOFP16-NEXT: fmadd d20, d22, d21, d20
; CHECK-SD-NOFP16-NEXT: mov h21, v5.h[6]
; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[6]
; CHECK-SD-NOFP16-NEXT: mov v7.h[2], v19.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h19, s23
; CHECK-SD-NOFP16-NEXT: fcvt s23, h24
; CHECK-SD-NOFP16-NEXT: fcvt s24, h25
; CHECK-SD-NOFP16-NEXT: fcvt s25, h26
; CHECK-SD-NOFP16-NEXT: fcvt s26, h27
; CHECK-SD-NOFP16-NEXT: fcvt s27, h28
; CHECK-SD-NOFP16-NEXT: fcvt s28, h29
; CHECK-SD-NOFP16-NEXT: fcvt h19, d23
; CHECK-SD-NOFP16-NEXT: fcvt d23, h24
; CHECK-SD-NOFP16-NEXT: fcvt d24, h25
; CHECK-SD-NOFP16-NEXT: fcvt d25, h26
; CHECK-SD-NOFP16-NEXT: fcvt d26, h27
; CHECK-SD-NOFP16-NEXT: fcvt d27, h28
; CHECK-SD-NOFP16-NEXT: fcvt d28, h29
; CHECK-SD-NOFP16-NEXT: mov h5, v5.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s21, h21
; CHECK-SD-NOFP16-NEXT: fcvt s22, h22
; CHECK-SD-NOFP16-NEXT: fcvt d21, h21
; CHECK-SD-NOFP16-NEXT: fcvt d22, h22
; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7]
; CHECK-SD-NOFP16-NEXT: mov v7.h[3], v19.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h19, s20
; CHECK-SD-NOFP16-NEXT: fcvt h19, d20
; CHECK-SD-NOFP16-NEXT: mov v6.h[4], v16.h[0]
; CHECK-SD-NOFP16-NEXT: fmadd s20, s25, s24, s23
; CHECK-SD-NOFP16-NEXT: fcvt h16, s17
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fmadd s18, s27, s26, s18
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmadd s21, s28, s22, s21
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmadd d20, d25, d24, d23
; CHECK-SD-NOFP16-NEXT: fcvt h16, d17
; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
; CHECK-SD-NOFP16-NEXT: fmadd d18, d27, d26, d18
; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
; CHECK-SD-NOFP16-NEXT: fmadd d21, d28, d22, d21
; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
; CHECK-SD-NOFP16-NEXT: mov v7.h[4], v19.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt h17, s20
; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
; CHECK-SD-NOFP16-NEXT: fcvt h17, d20
; CHECK-SD-NOFP16-NEXT: mov v6.h[5], v16.h[0]
; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s2, s4
; CHECK-SD-NOFP16-NEXT: fcvt h2, s18
; CHECK-SD-NOFP16-NEXT: fcvt h4, s21
; CHECK-SD-NOFP16-NEXT: fmadd s1, s1, s3, s5
; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d2, d4
; CHECK-SD-NOFP16-NEXT: fcvt h2, d18
; CHECK-SD-NOFP16-NEXT: fcvt h4, d21
; CHECK-SD-NOFP16-NEXT: fmadd d1, d1, d3, d5
; CHECK-SD-NOFP16-NEXT: mov v7.h[5], v17.h[0]
; CHECK-SD-NOFP16-NEXT: mov v6.h[6], v2.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
; CHECK-SD-NOFP16-NEXT: fcvt h1, d1
; CHECK-SD-NOFP16-NEXT: mov v7.h[6], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov v6.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v7.h[7], v1.h[0]

View File

@ -170,11 +170,11 @@ define half @frem_f16(half %x, half %y) #0 {
define half @fma_f16(half %x, half %y, half %z) #0 {
; CHECK-NOFP16-LABEL: fma_f16:
; CHECK-NOFP16: // %bb.0:
; CHECK-NOFP16-NEXT: fcvt s2, h2
; CHECK-NOFP16-NEXT: fcvt s1, h1
; CHECK-NOFP16-NEXT: fcvt s0, h0
; CHECK-NOFP16-NEXT: fmadd s0, s0, s1, s2
; CHECK-NOFP16-NEXT: fcvt h0, s0
; CHECK-NOFP16-NEXT: fcvt d2, h2
; CHECK-NOFP16-NEXT: fcvt d1, h1
; CHECK-NOFP16-NEXT: fcvt d0, h0
; CHECK-NOFP16-NEXT: fmadd d0, d0, d1, d2
; CHECK-NOFP16-NEXT: fcvt h0, d0
; CHECK-NOFP16-NEXT: ret
;
; CHECK-FP16-LABEL: fma_f16:
@ -1382,3 +1382,5 @@ declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadat
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-GI: {{.*}}

View File

@ -1043,38 +1043,38 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #24]
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
@ -1103,38 +1103,38 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #24]
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
@ -1163,74 +1163,74 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
; NONEON-NOSVE-NEXT: ldr h0, [sp, #46]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #14]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #12]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #62]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #44]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #10]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #60]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #42]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #24]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #58]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #40]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #56]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #38]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #20]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #54]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #36]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #18]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #52]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #34]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #16]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #50]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #32]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #48]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #48]
; NONEON-NOSVE-NEXT: add sp, sp, #64
@ -1264,146 +1264,146 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #78]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #62]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #76]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #60]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #126]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #92]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #74]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #58]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #124]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #90]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #72]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #56]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #122]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #88]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #70]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #54]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #120]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #86]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #68]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #52]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #118]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #84]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #66]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #50]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #116]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #82]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #64]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #48]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #114]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #80]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #14]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #112]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #46]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #12]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #110]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #44]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #10]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #108]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #42]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #24]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #106]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #40]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #104]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #38]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #20]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #102]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #36]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #18]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #100]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #34]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #16]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d1, h1
; NONEON-NOSVE-NEXT: fcvt d2, h2
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #98]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #32]
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: fcvt d0, h0
; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #96]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96]
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]

View File

@ -1508,61 +1508,81 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 {
; CHECK-FP16-NEXT: push {r4, lr}
; CHECK-FP16-NEXT: mov r4, r0
; CHECK-FP16-NEXT: ldrh r0, [r1]
; CHECK-FP16-NEXT: ldrh r1, [r4]
; CHECK-FP16-NEXT: ldrh r2, [r2]
; CHECK-FP16-NEXT: vmov s2, r0
; CHECK-FP16-NEXT: vmov s0, r1
; CHECK-FP16-NEXT: vcvtb.f32.f16 s1, s2
; CHECK-FP16-NEXT: vmov s2, r2
; CHECK-FP16-NEXT: ldrh r1, [r2]
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: ldrh r0, [r4]
; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-FP16-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-FP16-NEXT: bl fmaf
; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-FP16-NEXT: vmov r0, s0
; CHECK-FP16-NEXT: vcvt.f64.f32 d16, s0
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-FP16-NEXT: vcvt.f64.f32 d17, s0
; CHECK-FP16-NEXT: vmov s0, r1
; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-FP16-NEXT: vcvt.f64.f32 d18, s0
; CHECK-FP16-NEXT: vmla.f64 d18, d17, d16
; CHECK-FP16-NEXT: vmov r0, r1, d18
; CHECK-FP16-NEXT: bl __aeabi_d2h
; CHECK-FP16-NEXT: strh r0, [r4]
; CHECK-FP16-NEXT: pop {r4, pc}
;
; CHECK-LIBCALL-VFP-LABEL: test_fma:
; CHECK-LIBCALL-VFP: .save {r4, r5, r6, lr}
; CHECK-LIBCALL-VFP-NEXT: push {r4, r5, r6, lr}
; CHECK-LIBCALL-VFP-NEXT: .vsave {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: vpush {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: mov r4, r0
; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r2]
; CHECK-LIBCALL-VFP-NEXT: mov r5, r1
; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r0]
; CHECK-LIBCALL-VFP-NEXT: mov r5, r2
; CHECK-LIBCALL-VFP-NEXT: mov r6, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
; CHECK-LIBCALL-VFP-NEXT: mov r6, r0
; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r5]
; CHECK-LIBCALL-VFP-NEXT: ldrh r1, [r6]
; CHECK-LIBCALL-VFP-NEXT: vmov s16, r0
; CHECK-LIBCALL-VFP-NEXT: ldrh r5, [r5]
; CHECK-LIBCALL-VFP-NEXT: mov r0, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
; CHECK-LIBCALL-VFP-NEXT: mov r5, r0
; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r4]
; CHECK-LIBCALL-VFP-NEXT: vmov s18, r0
; CHECK-LIBCALL-VFP-NEXT: mov r0, r5
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
; CHECK-LIBCALL-VFP-NEXT: vmov s0, r0
; CHECK-LIBCALL-VFP-NEXT: vmov s1, r5
; CHECK-LIBCALL-VFP-NEXT: vmov s2, r6
; CHECK-LIBCALL-VFP-NEXT: bl fmaf
; CHECK-LIBCALL-VFP-NEXT: vmov r0, s0
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_f2h
; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d16, s18
; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d17, s16
; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d18, s0
; CHECK-LIBCALL-VFP-NEXT: vmla.f64 d18, d17, d16
; CHECK-LIBCALL-VFP-NEXT: vmov r0, r1, d18
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_d2h
; CHECK-LIBCALL-VFP-NEXT: strh r0, [r4]
; CHECK-LIBCALL-VFP-NEXT: vpop {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: pop {r4, r5, r6, pc}
;
; CHECK-NOVFP-LABEL: test_fma:
; CHECK-NOVFP: .save {r4, r5, r6, lr}
; CHECK-NOVFP-NEXT: push {r4, r5, r6, lr}
; CHECK-NOVFP: .save {r4, r5, r6, r7, r11, lr}
; CHECK-NOVFP-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-NOVFP-NEXT: mov r4, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r1]
; CHECK-NOVFP-NEXT: mov r5, r2
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
; CHECK-NOVFP-NEXT: bl __aeabi_f2d
; CHECK-NOVFP-NEXT: mov r6, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r4]
; CHECK-NOVFP-NEXT: mov r7, r1
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
; CHECK-NOVFP-NEXT: bl __aeabi_f2d
; CHECK-NOVFP-NEXT: mov r2, r6
; CHECK-NOVFP-NEXT: mov r3, r7
; CHECK-NOVFP-NEXT: bl __aeabi_dmul
; CHECK-NOVFP-NEXT: mov r6, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r5]
; CHECK-NOVFP-NEXT: mov r7, r1
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
; CHECK-NOVFP-NEXT: mov r5, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r4]
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
; CHECK-NOVFP-NEXT: mov r1, r6
; CHECK-NOVFP-NEXT: mov r2, r5
; CHECK-NOVFP-NEXT: bl fmaf
; CHECK-NOVFP-NEXT: bl __aeabi_f2h
; CHECK-NOVFP-NEXT: bl __aeabi_f2d
; CHECK-NOVFP-NEXT: mov r2, r0
; CHECK-NOVFP-NEXT: mov r3, r1
; CHECK-NOVFP-NEXT: mov r0, r6
; CHECK-NOVFP-NEXT: mov r1, r7
; CHECK-NOVFP-NEXT: bl __aeabi_dadd
; CHECK-NOVFP-NEXT: bl __aeabi_d2h
; CHECK-NOVFP-NEXT: strh r0, [r4]
; CHECK-NOVFP-NEXT: pop {r4, r5, r6, pc}
; CHECK-NOVFP-NEXT: pop {r4, r5, r6, r7, r11, pc}
%a = load half, ptr %p, align 2
%b = load half, ptr %q, align 2
%c = load half, ptr %r, align 2

View File

@ -8,39 +8,39 @@
; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN %}
; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; FIXME: BPF has a compiler error
; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 -mcpu=ck860fv -mattr=+hard-float | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; FIXME: directx has a compiler error
; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %}
; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %}
; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,CHECK-FMA %}

View File

@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; ## Full FP16 support enabled by default.
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
@ -953,11 +954,11 @@ define half @test_cos(half %a) #0 {
; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_fma_param_2];
; CHECK-F16-NOFTZ: fma.rn.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
; CHECK-F16-FTZ: fma.rn.ftz.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%r[0-9]+]], [[A]]
; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%r[0-9]+]], [[B]]
; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%r[0-9]+]], [[C]]
; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
; CHECK-NOF16-DAG: cvt.f64.f16 [[A64:%rd[0-9]+]], [[A]]
; CHECK-NOF16-DAG: cvt.f64.f16 [[B64:%rd[0-9]+]], [[B]]
; CHECK-NOF16-DAG: cvt.f64.f16 [[C64:%rd[0-9]+]], [[C]]
; CHECK-NOF16-NEXT: fma.rn.f64 [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]];
; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]]
; CHECK: st.param.b16 [func_retval0], [[R]];
; CHECK: ret
define half @test_fma(half %a, half %b, half %c) #0 {
@ -1151,11 +1152,11 @@ define half @test_round(half %a) #0 {
; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_fmuladd_param_2];
; CHECK-F16-NOFTZ: fma.rn.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
; CHECK-F16-FTZ: fma.rn.ftz.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%r[0-9]+]], [[A]]
; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%r[0-9]+]], [[B]]
; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%r[0-9]+]], [[C]]
; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
; CHECK-NOF16-DAG: cvt.f64.f16 [[A64:%rd[0-9]+]], [[A]]
; CHECK-NOF16-DAG: cvt.f64.f16 [[B64:%rd[0-9]+]], [[B]]
; CHECK-NOF16-DAG: cvt.f64.f16 [[C64:%rd[0-9]+]], [[C]]
; CHECK-NOF16-NEXT: fma.rn.f64 [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]];
; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]]
; CHECK: st.param.b16 [func_retval0], [[R]];
; CHECK: ret;
define half @test_fmuladd(half %a, half %b, half %c) #0 {
@ -1183,3 +1184,9 @@ define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 {
}
attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; CHECK-F16-FTZ: {{.*}}
; CHECK-F16-NOFTZ: {{.*}}
; CHECK-NOF16: {{.*}}
; CHECK-NOFTZ: {{.*}}

View File

@ -1766,27 +1766,28 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
; CHECK-NOF16-LABEL: test_fma(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
; CHECK-NOF16-NEXT: .reg .b32 %r<5>;
; CHECK-NOF16-NEXT: .reg .b64 %rd<9>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2];
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0];
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd1, %rs2;
; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd2, %rs4;
; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6;
; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4;
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5;
; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8;
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11;
; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7};
; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd3, %rs6;
; CHECK-NOF16-NEXT: fma.rn.f64 %rd4, %rd3, %rd2, %rd1;
; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs7, %rd4;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd5, %rs1;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd6, %rs3;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd7, %rs5;
; CHECK-NOF16-NEXT: fma.rn.f64 %rd8, %rd7, %rd6, %rd5;
; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs8, %rd8;
; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7};
; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NOF16-NEXT: ret;
%r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
ret <2 x half> %r
@ -2203,27 +2204,28 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
; CHECK-NOF16-LABEL: test_fmuladd(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
; CHECK-NOF16-NEXT: .reg .b32 %r<5>;
; CHECK-NOF16-NEXT: .reg .b64 %rd<9>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2];
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0];
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd1, %rs2;
; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd2, %rs4;
; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6;
; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4;
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5;
; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8;
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11;
; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7};
; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd3, %rs6;
; CHECK-NOF16-NEXT: fma.rn.f64 %rd4, %rd3, %rd2, %rd1;
; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs7, %rd4;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd5, %rs1;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd6, %rs3;
; CHECK-NOF16-NEXT: cvt.f64.f16 %rd7, %rs5;
; CHECK-NOF16-NEXT: fma.rn.f64 %rd8, %rd7, %rd6, %rd5;
; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs8, %rd8;
; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7};
; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NOF16-NEXT: ret;
%r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
ret <2 x half> %r

View File

@ -1093,28 +1093,41 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
; RV32I-NEXT: addi s3, a1, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: addi s4, a1, -1
; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: mv s3, a1
; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s2
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@ -1132,17 +1145,22 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a2, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@ -1194,35 +1212,48 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: lui a0, 16
; RV32I-NEXT: addi s2, a0, -1
; RV32I-NEXT: and a0, a2, s2
; RV32I-NEXT: addi s3, a0, -1
; RV32I-NEXT: and a0, a2, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
; RV32I-NEXT: xor s3, a0, a1
; RV32I-NEXT: and a0, s1, s2
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s0, s2
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: and a0, s3, s2
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@ -1247,17 +1278,22 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s0
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@ -1329,8 +1365,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
; RV32I-NEXT: addi s3, a1, -1
; RV32I-NEXT: lui s3, 16
; RV32I-NEXT: addi s3, s3, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
@ -1347,17 +1383,26 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s2, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, s0
; RV32I-NEXT: mv a3, s1
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@ -1378,8 +1423,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a2
; RV64I-NEXT: mv s1, a1
; RV64I-NEXT: lui a1, 16
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: lui s3, 16
; RV64I-NEXT: addi s3, s3, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: li a1, 0
@ -1396,17 +1441,21 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s4, a0, a1
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s2, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, s0
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s4, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s0
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@ -1491,8 +1540,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: lui a0, 16
; RV32I-NEXT: addi s3, a0, -1
; RV32I-NEXT: lui s3, 16
; RV32I-NEXT: addi s3, s3, -1
; RV32I-NEXT: and a0, a1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
@ -1509,17 +1558,28 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s2, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@ -1540,8 +1600,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: lui a0, 16
; RV64I-NEXT: addi s3, a0, -1
; RV64I-NEXT: lui s3, 16
; RV64I-NEXT: addi s3, s3, -1
; RV64I-NEXT: and a0, a1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: li a1, 0
@ -1558,17 +1618,22 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s4, a0, a1
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s2, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s0
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s4, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a2, a0
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s0
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@ -1659,23 +1724,35 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
; RV32I-NEXT: addi s3, a1, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: addi s4, a1, -1
; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: mv s3, a1
; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s2
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lui a1, 1048568
; RV32I-NEXT: xor a0, a0, a1
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
@ -1683,6 +1760,7 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@ -1700,17 +1778,22 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a2, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: lui a1, 1048568
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@ -1779,23 +1862,35 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
; RV32I-NEXT: addi s3, a1, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: addi s4, a1, -1
; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: mv s3, a1
; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s2
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lui a1, 1048568
; RV32I-NEXT: xor a0, a0, a1
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
@ -1803,6 +1898,7 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@ -1820,17 +1916,22 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a2, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: lui a1, 1048568
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@ -1892,34 +1993,46 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
; RV32I-NEXT: addi s2, a1, -1
; RV32I-NEXT: and a0, a0, s2
; RV32I-NEXT: addi s3, a1, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
; RV32I-NEXT: xor s3, a0, a1
; RV32I-NEXT: and a0, s1, s2
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s0, s2
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: and a0, s3, s2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, s1
; RV32I-NEXT: mv a3, s2
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: mv a2, s0
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@ -1944,16 +2057,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: mv a2, s0
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@ -2020,35 +2138,48 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: lui a0, 16
; RV32I-NEXT: addi s2, a0, -1
; RV32I-NEXT: and a0, a1, s2
; RV32I-NEXT: addi s3, a0, -1
; RV32I-NEXT: and a0, a1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
; RV32I-NEXT: xor s3, a0, a1
; RV32I-NEXT: and a0, s1, s2
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s0, s2
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: and a0, s3, s2
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv a1, a0
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a2, s0
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@ -2073,17 +2204,22 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a2, s0
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload

View File

@ -1690,28 +1690,41 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
; RV32I-NEXT: addi s3, a1, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: addi s4, a1, -1
; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: mv s3, a1
; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s2
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call fmaf
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __adddf3
; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@ -1729,17 +1742,22 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: mv a2, a0
; RV64I-NEXT: mv a0, s2
; RV64I-NEXT: mv a1, s1
; RV64I-NEXT: call fmaf
; RV64I-NEXT: call __truncsfhf2
; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __adddf3
; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@ -1748,14 +1766,41 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi sp, sp, 48
; RV64I-NEXT: ret
;
; CHECKIZFHMIN-LABEL: fma_f16:
; CHECKIZFHMIN: # %bb.0:
; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2
; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1
; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0
; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5
; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5
; CHECKIZFHMIN-NEXT: ret
; RV32IFZFHMIN-LABEL: fma_f16:
; RV32IFZFHMIN: # %bb.0:
; RV32IFZFHMIN-NEXT: fcvt.s.h fa5, fa2
; RV32IFZFHMIN-NEXT: fcvt.s.h fa4, fa1
; RV32IFZFHMIN-NEXT: fcvt.s.h fa3, fa0
; RV32IFZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5
; RV32IFZFHMIN-NEXT: fcvt.h.s fa0, fa5
; RV32IFZFHMIN-NEXT: ret
;
; RV64IFZFHMIN-LABEL: fma_f16:
; RV64IFZFHMIN: # %bb.0:
; RV64IFZFHMIN-NEXT: fcvt.s.h fa5, fa2
; RV64IFZFHMIN-NEXT: fcvt.s.h fa4, fa1
; RV64IFZFHMIN-NEXT: fcvt.s.h fa3, fa0
; RV64IFZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5
; RV64IFZFHMIN-NEXT: fcvt.h.s fa0, fa5
; RV64IFZFHMIN-NEXT: ret
;
; RV32IDZFHMIN-LABEL: fma_f16:
; RV32IDZFHMIN: # %bb.0:
; RV32IDZFHMIN-NEXT: fcvt.d.h fa5, fa2
; RV32IDZFHMIN-NEXT: fcvt.d.h fa4, fa1
; RV32IDZFHMIN-NEXT: fcvt.d.h fa3, fa0
; RV32IDZFHMIN-NEXT: fmadd.d fa5, fa3, fa4, fa5
; RV32IDZFHMIN-NEXT: fcvt.h.d fa0, fa5
; RV32IDZFHMIN-NEXT: ret
;
; RV64IDZFHMIN-LABEL: fma_f16:
; RV64IDZFHMIN: # %bb.0:
; RV64IDZFHMIN-NEXT: fcvt.d.h fa5, fa2
; RV64IDZFHMIN-NEXT: fcvt.d.h fa4, fa1
; RV64IDZFHMIN-NEXT: fcvt.d.h fa3, fa0
; RV64IDZFHMIN-NEXT: fmadd.d fa5, fa3, fa4, fa5
; RV64IDZFHMIN-NEXT: fcvt.h.d fa0, fa5
; RV64IDZFHMIN-NEXT: ret
;
; CHECKIZHINXMIN-LABEL: fma_f16:
; CHECKIZHINXMIN: # %bb.0:

View File

@ -8,12 +8,12 @@ declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
define half @f0(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f0:
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK-SCALAR: maebr %f0, %f9, %f10
; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncsfhf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK-SCALAR: madbr %f0, %f9, %f10
; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc)
ret half %res

View File

@ -10,12 +10,12 @@ define half @f0(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f0:
; CHECK-NOT: brasl
; CHECK: lcdfr %f{{[0-9]+}}, %f4
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK-SCALAR: maebr %f0, %f8, %f10
; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncsfhf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK-SCALAR: madbr %f0, %f8, %f10
; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%negacc = fneg half %acc
%res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc)

View File

@ -25,11 +25,11 @@ define double @f2(double %f1, double %f2, double %acc) {
define half @f3_half(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f3_half:
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: wfmasb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncsfhf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: wfmadb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f0
; CHECK-NEXT: lmg
@ -52,11 +52,11 @@ define half @f4_half(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f4_half:
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f4
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: wfmasb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncsfhf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: wfmadb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f0
; CHECK-NEXT: lmg

View File

@ -8,13 +8,13 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metada
define half @f0(half %f1, half %f2, half %acc) #0 {
; CHECK-LABEL: f0:
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK: brasl %r14, __extendhfsf2@PLT
; CHECK-SCALAR: maebr %f10, %f0, %f8
; CHECK-SCALAR: ler %f0, %f10
; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncsfhf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK: brasl %r14, __extendhfdf2@PLT
; CHECK-SCALAR: madbr %f10, %f0, %f8
; CHECK-SCALAR: ldr %f0, %f10
; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%res = call half @llvm.experimental.constrained.fma.f16 (
half %f1, half %f2, half %acc,

View File

@ -432,8 +432,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; SSE2: # %bb.0:
; SSE2-NEXT: subq $24, %rsp
; SSE2-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: callq __extendhfsf2@PLT
; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@ -443,12 +442,17 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __extendhfsf2@PLT
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: callq fmaf@PLT
; SSE2-NEXT: callq __truncsfhf2@PLT
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtss2sd %xmm0, %xmm2
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtss2sd %xmm0, %xmm1
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
; SSE2-NEXT: callq fma@PLT
; SSE2-NEXT: callq __truncdfhf2@PLT
; SSE2-NEXT: addq $24, %rsp
; SSE2-NEXT: retq
;
@ -460,38 +464,42 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; F16C-NEXT: vpextrw $0, %xmm2, %edx
; F16C-NEXT: movzwl %dx, %edx
; F16C-NEXT: vmovd %edx, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm2
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm2
; F16C-NEXT: movzwl %cx, %ecx
; F16C-NEXT: vmovd %ecx, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm1
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm1
; F16C-NEXT: movzwl %ax, %eax
; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq fmaf@PLT
; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; F16C-NEXT: callq fma@PLT
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: popq %rax
; F16C-NEXT: retq
;
; AVX512-LABEL: fma_f16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpextrw $0, %xmm1, %eax
; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
; AVX512-NEXT: pushq %rax
; AVX512-NEXT: vpextrw $0, %xmm0, %eax
; AVX512-NEXT: vpextrw $0, %xmm1, %ecx
; AVX512-NEXT: vpextrw $0, %xmm2, %edx
; AVX512-NEXT: movzwl %dx, %edx
; AVX512-NEXT: vmovd %edx, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm1
; AVX512-NEXT: movzwl %cx, %ecx
; AVX512-NEXT: vmovd %ecx, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm2
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: vmovd %eax, %xmm2
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: popq %rax
; AVX512-NEXT: retq
;
; X86-LABEL: fma_f16:

View File

@ -421,10 +421,13 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-NEXT: callq fmaf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; F16C-NEXT: callq fma@PLT
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
@ -440,24 +443,27 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; X64-NEXT: pushq %rbx
; X64-NEXT: subq $16, %rsp
; X64-NEXT: movq %rdi, %rbx
; X64-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: movaps %xmm2, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvtss2sd %xmm0, %xmm0
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; X64-NEXT: # xmm1 = mem[0],zero,zero,zero
; X64-NEXT: cvtss2sd %xmm1, %xmm1
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; X64-NEXT: # xmm2 = mem[0],zero,zero,zero
; X64-NEXT: callq fmaf@PLT
; X64-NEXT: callq __truncsfhf2@PLT
; X64-NEXT: cvtss2sd %xmm2, %xmm2
; X64-NEXT: callq fma@PLT
; X64-NEXT: callq __truncdfhf2@PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
; X64-NEXT: movw %ax, (%rbx)
; X64-NEXT: addq $16, %rsp
@ -467,7 +473,7 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; X86-LABEL: test_half_fma:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $72, %esp
; X86-NEXT: subl $88, %esp
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
@ -487,17 +493,17 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: calll __extendhfsf2
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps (%esp)
; X86-NEXT: calll fmaf
; X86-NEXT: fstps (%esp)
; X86-NEXT: calll __truncsfhf2
; X86-NEXT: fstpl (%esp)
; X86-NEXT: calll fma
; X86-NEXT: fstpl (%esp)
; X86-NEXT: calll __truncdfhf2
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esi)
; X86-NEXT: addl $72, %esp
; X86-NEXT: addl $88, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
%res = call half @llvm.fma.half(half %a0, half %a1, half %a2)