From 0cecacd971a5471803b79f2b4a976ce75a2539b2 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Tue, 31 Mar 2026 12:00:11 +0100
Subject: [PATCH] [compiler-rt][ARM] Optimized double precision FP add/sub
 (#179921)

The one new assembly source file, `arm/adddf3.S`, implements both
addition and subtraction via cross-branching after flipping signs, since
both operations must provide substantially the same logic. The new cmake
properties introduced in a prior commit are used to arrange that
including `adddf3.S` supersedes the C versions of both addition and
subtraction, and also informs the test suite that both functions are
available to test.
---
 compiler-rt/lib/builtins/CMakeLists.txt       |    2 +
 compiler-rt/lib/builtins/arm/adddf3.S         | 1140 +++++++++++++++++
 .../test/builtins/Unit/adddf3new_test.c       |  684 ++++++++++
 .../test/builtins/Unit/subdf3new_test.c       |  706 ++++++++++
 4 files changed, 2532 insertions(+)
 create mode 100644 compiler-rt/lib/builtins/arm/adddf3.S
 create mode 100644 compiler-rt/test/builtins/Unit/adddf3new_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/subdf3new_test.c

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index c83488bd3ed5..503a9aa3ff4e 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -468,6 +468,7 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm")
     set(assembly_files
       arm/mulsf3.S
       arm/divsf3.S
+      arm/adddf3.S
       )
     set_source_files_properties(${assembly_files}
       PROPERTIES COMPILE_OPTIONS ${implicit_it_flag})
@@ -515,6 +516,7 @@ set(thumb1_base_SOURCES
   arm/addsf3.S
   ${GENERIC_SOURCES}
 )
+set_special_properties(arm/adddf3.S SUPERSEDES subdf3.c PROVIDES subdf3)
 
 if(COMPILER_RT_ARM_OPTIMIZED_FP)
   set(thumb1_base_SOURCES
diff --git a/compiler-rt/lib/builtins/arm/adddf3.S b/compiler-rt/lib/builtins/arm/adddf3.S
new file mode 100644
index 000000000000..8c1a53a4d1bc
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/adddf3.S
@@ -0,0 +1,1140 @@
+//===-- adddf3.S - Add/subtract double precision floating point numbers ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the __adddf3 and __subdf3 functions (double precision
+// floating point number addition and subtraction), with the IEEE-754 default
+// rounding (to nearest, ties to even), for the Arm and Thumb2 ISAs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../assembly.h"
+#include "crt_endian.h"
+
+  .syntax unified
+  .text
+  .p2align 2
+
+// General structure of this code:
+//
+// There are three actual entry points here, for addition, subtraction and
+// reversed subtraction (just taking the operands the other way round, so that
+// it returns y-x instead of x-y). But the first thing the functions do (after
+// checking for NaNs) is to sort out whether the magnitudes of the two inputs
+// are being added (x+y with like signs, or x-y with different signs), or
+// subtracted. So dadd jumps across into the middle of dsub if it sees that the
+// signs are different, and vice versa. Then the main code path in dadd handles
+// magnitude addition, and the one in dsub handles magnitude subtraction.
+//
+// NaNs are checked first, so that an input NaN can be propagated exactly,
+// including its sign bit. After ruling out that case, it's safe to flip the
+// sign of one of the inputs, so that during the cross-calls, x - y can be
+// rewritten as x + (-y) and vice versa.
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__adddf3)
+  push {r4, lr}                 // keep lr across the call below
+  VMOV_FROM_DOUBLE(r0, r1, d0)  // presumably copies operand x out of d0
+  VMOV_FROM_DOUBLE(r2, r3, d1)  // presumably copies operand y out of d1
+  bl __aeabi_dadd               // add via the core-register entry point
+  VMOV_TO_DOUBLE(d0, r0, r1)    // presumably copies the result back to d0
+  pop {r4, pc}                  // restore r4 and return to the caller
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__adddf3, __aeabi_dadd)
+#endif
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_dadd)
+
+  push    {r4, r14}
+
+  // Test for all uncommon values at once: infinities, NaNs, denormals and
+  // zeroes. Branch out of line if any are found. We do this by incrementing
+  // the exponent of each input, so that the two extreme exponents 0x7ff,0x000
+  // map to 0x000,0x001 respectively. Then the original number had one of those
+  // exponents precisely when the modified version has the top 10 exponent bits
+  // zero.
+  //
+  // The constant we load into r14 for testing those ten exponent bits will be
+  // reused later. (We could load a constant suitable for just this initial
+  // test slightly more efficiently by writing MOVW r14,#0x3ff or similar, but
+  // having the set bits at the top of the word is useful later because we can
+  // extend them using ASR.)
+  ldr     r14, =0xFFC00000
+  add     r12, xh, #1 << 20   // r12 has the adjusted version of x's exponent
+  add     r4, yh, #1 << 20    // and r4 the adjusted version of y's
+  tst     r14, r12, lsl #1    // test the top 10 exponent bits of each
+  tstne   r14, r4, lsl #1
+  beq     LOCAL_LABEL(add_uncommon)       // and branch out of line if either is 0
+
+  // Now we have two normalised numbers. If their signs are opposite, we should
+  // be subtracting their magnitudes rather than adding, so cross-jump to dsub.
+  teq     xh, yh
+  eormi   yh, yh, #1 << 31
+  bmi     LOCAL_LABEL(sub_magnitude)
+LOCAL_LABEL(add_magnitude):
+  // If we get here, we're adding operands with equal signs (i.e. a magnitude
+  // addition). First thing to do is put the operands in magnitude order, so
+  // that x >= y.
+  subs    r4, xl, yl          // compare inputs, also keeping x-y
+  sbcs    r12, xh, yh
+  bhs     LOCAL_LABEL(add_swapped)        // if x>=y then branch round the swap
+  adds    yl, yl, r4          // otherwise turn y into x by adding (x-y)
+  adc     yh, yh, r12
+  subs    xl, xl, r4          // and turn x into y by subtracting it
+  sbc     xh, xh, r12
+LOCAL_LABEL(add_swapped):
+  // Keep the sign and exponent of the larger input, to use as the sign and
+  // exponent of the output (up to carries and overflows). Also calculate the
+  // exponent difference, which tells us how far we'll need to shift y's
+  // mantissa right to add it to x's.
+  //
+  // The shifted-right values will include the sign bits as well as the
+  // exponents, but both sign bits are the same, so they'll cancel.
+  lsr     r4, xh, #20            // r4 = initial sign+exponent of the output
+  sub     r12, r4, yh, lsr #20   // r12 = exponent difference
+
+  // Clear the exponents and signs off the numbers to prepare for the addition.
+  // (We reuse the value 0xffc00000 that we left in r14 on entry: ASRing that
+  // right by 2 gives 0xfff00000, just the bit mask we wanted.)
+  //
+  // Also OR in the leading 1 bit of y's mantissa, so that when we shift it
+  // right and add, it will be included in the addition.
+  //
+  // (It's cheaper not to bother doing the same for x, unless the addition
+  // carries into the exponent.)
+  bic     xh, xh, r14, asr #2
+  bic     yh, yh, r14, asr #2
+  orr     yh, yh, #1 << 20
+
+LOCAL_LABEL(add_doadd):
+  // Here we perform the actual addition. We either fell through from the code
+  // above, or jumped back to here after handling an input denormal.
+  //
+  // We get here with:
+  //   Operands known to be numeric rather than zero/infinity/NaN;
+  //   xh:xl = mantissa of larger operand, with low bit at the bottom of xl
+  //   yh:yl = mantissa of smaller operand, with low bit at the bottom of yl
+  //   r4 = result sign and exponent (in low 12 bits);
+  //   r12 = exponent difference.
+  //
+  // For normal inputs, the mantissa of y will have the leading bit set.
+  // Denormals will leave that bit clear, treating the number as 0.[mantissa] x
+  // 2^(fixed exponent) instead of renormalising to 1.[mantissa] x 2^(variable
+  // exponent) as a multiplication would want.
+
+  // The main addition involves shifting y right by the exponent difference in
+  // r12, and adding it to x. This must be done differently depending on how
+  // big the exponent difference is. Start by checking if it's at most 32.
+  rsbs    r14, r12, #32
+  blo     LOCAL_LABEL(add_bigshift)
+
+  // The exponent difference is 32 or less. The test above also left
+  // (32-difference) in r14, which is the amount we need to shift yh left by to
+  // recover the bits that the right shift will lose off the bottom.
+#if __thumb__
+  // Thumb can't fold a register-controlled shift into an add, so we must use
+  // two separate instructions in each case.
+  //
+  // We don't have any more spare registers, so we'll use r14 as a temporary
+  // register to hold each shifted value before adding it to something. This
+  // clobbers the inverted shift count in r14, which we're going to need again
+  // during rounding, so we must recompute it after the additions are complete.
+  // (It would cost more cycles to avoid that awkwardness by pushing and
+  // popping an extra register around the whole function.)
+  //
+  // To avoid recomputing r14 _twice_, we do the addition of (yh shifted left)
+  // first, so we can use the value in r14 before clobbering it at all.
+  lsl     r14, yh, r14
+  adds    xl, xl, r14
+  adcs    xh, xh, #0
+  // Now do the addition of (yh shifted right) and (yl shifted right).
+  lsr     r14, yl, r12
+  adds    xl, xl, r14
+  lsr     r14, yh, r12
+  adc     xh, xh, r14
+  // And now reconstruct the inverted shift count, for use later.
+  rsb     r14, r12, #32
+#else
+  // Add the right-shifted parts of yh and yl to xh and xl, keeping the carry
+  // in between if any.
+  adds    xl, xl, yl, lsr r12
+  adc     xh, xh, yh, lsr r12
+  // Now add the remainder of yh to the low word, again checking for a carry.
+  adds    xl, xl, yh, lsl r14
+  adcs    xh, xh, #0
+#endif
+
+  // If that addition carried into bit 20 of xh, then the number has increased
+  // its exponent. Diverge into a completely separate code path for that case,
+  // because there we must check for overflow.
+  cmp     xh, #1 << 20
+  bhs     LOCAL_LABEL(add_carry)
+
+  // Here, on the non-carrying path, we don't need to check for overflow at
+  // all. If there is an overflow it can only be due to rounding up, so the
+  // overflowed mantissa will be all zeroes, so the naively generated output
+  // will look like the correct infinity anyway.
+  //
+  // Recombine the mantissa with the sign + exponent (in r4) via addition.
+  add     xh, xh, r4, lsl #20
+  // Now our number is complete apart from rounding.
+
+LOCAL_LABEL(add_nocarry):
+  // This is the general rounding path for additions that didn't carry into the
+  // next exponent. We come here with the unrounded output in xh:xl, and yl and
+  // r14 set up so that (yl << r14) consists of all the bits shifted off the
+  // bottom of y's mantissa, or at least some approximation to them good enough
+  // to make the right rounding decision.
+  //
+  // Perform that shift, which sets the N flag if we need to round.
+  lsls    yl, yl, r14
+
+  // We're done with our two extra registers, so we can pop them.
+  pop     {r4, r14}
+
+  // If N is clear, we're rounding down (or the result was exact), and we know
+  // there was no overflow either, so xh:xl contains the correct output and we
+  // can return immediately.
+  bxpl    lr
+
+  // Otherwise, we're rounding up, or rounding to even. Start by incrementing
+  // the low word of the output.
+  adds    xl, xl, #1
+
+  // The obvious thing to do next would be to ADC xh, xh, #0, propagating any
+  // carry from that ADDS, and completing the addition of 1 to the 64-bit value
+  // in xh:xl. But we can do better, by doing a combined test for that carry
+  // _and_ round-to-even, and returning as quickly as possible in the common
+  // case where neither has happened.
+  //
+  // The Z flag is set if the addition to xl carried, and clear if it didn't.
+  // So if Z is clear, we also test the bits of yl below the round bit. Then if
+  // Z is still clear, there was no carry into xh _and_ no round to even, so we
+  // can return.
+  lslsne  yl, yl, #1
+  bxne    lr
+
+  // Now we know that we've just incremented xl, and either or both of these
+  // things is true:
+  //
+  //  1. this is a halfway case that needs rounding to even
+  //  2. the increment of xl wrapped it round from 0xFFFFFFFF to 0
+  //
+  // We can reliably tell if #2 is true by checking if xl = 0. If that is so,
+  // we must increment xh. On the other hand, if xl != 0, then #1 must be true,
+  // so we clear the low bit of xl to complete the round-to-even.
+  //
+  // What if _both_ are true? Luckily, it doesn't matter, because if xl = 0
+  // then its low bit is already clear, so it makes no difference whether we
+  // clear it or not.
+  cmp     xl, #0                  // is xl 0?
+  bicne   xl, xl, #1              // if not, then round to even
+  adceq   xh, xh, #0              // if so, then increment xh
+  bx      lr
+
+LOCAL_LABEL(add_bigshift):
+  // We come here from add_doadd if y's mantissa must be shifted right by more
+  // than 32 bits. So all of yl is going to be shifted off the bottom, not
+  // _even_ into the bit that determines rounding up or down. Therefore we can
+  // approximate it well enough by a single bit at the bottom of yh, which is 1
+  // if any bit of yl is 1.
+  //
+  // We put the modified value in yl, which is where the rounding code (shared
+  // with the case for shift <= 32 bits) will expect to find the value it has
+  // to shift left to make the round word.
+  cmp     yl, #1                  // set C if yl != 0 (i.e. yl >= 1 unsigned)
+  adc     yl, yh, yh              // shift yh left 1, putting C at the bottom
+
+  // Calculate shift counts. r12 is adjusted down by 32 so it tells us how much
+  // to shift yh right by when adding; r14 is the distance to shift yl left by
+  // to make the round word (again where the shared rounding code will expect
+  // to find it).
+  //
+  // The second instruction also has the side effect of checking whether the
+  // shift count in r12 is greater than 31, which we'll use in a moment.
+  sub     r12, r12, #32
+  rsbs    r14, r12, #31
+
+  // Double precision exponents are bigger than 8 bits, so it's possible that
+  // the exponent difference is > 255. AArch32 shift operations tolerate shifts
+  // bigger than the size of the word, but only up to 255, because they only
+  // look at the low 8 bits. So we must detect that r12 was huge, and handle it
+  // specially.
+  //
+  // In this situation we reset r14 to 0, so that the rounding code will not
+  // shift yl left at all. Since the top bit of yl is clear (we made yl by
+  // shifting the top word of a mantissa left by 1, so its highest set bit is
+  // at most bit 21), the effect is to consider _all_ of y's mantissa to be
+  // lower than the round bit.
+  movlo   r14, #0
+
+  // Do the actual addition, again conditionalised on the result of checking
+  // whether the shift count r12 was too big.
+#if __thumb__
+  // As noted above, Thumb can't fold a register-controlled shift into an add,
+  // so we must use two instructions.
+  lsrhs   yh, yh, r12
+  addshs  xl, xl, yh
+#else
+  addshs  xl, xl, yh, lsr r12
+#endif
+
+  // Recombine the (unrounded) output mantissa with the output sign and
+  // exponent in r4. This also propagates any carry from xl into xh, from the
+  // addition. (Luckily the condition for skipping the addition also implies
+  // C=0, so in that situation, the ADC is still harmless.)
+  adc     xh, xh, r4, lsl #20
+
+  // Check whether the addition carried into the exponent field, by seeing if
+  // the exponent that ended up at the top of xh is the same as the one in r4
+  // that we just added. If it is the same (no carry) then we can go to
+  // add_nocarry to do the easy version of rounding that doesn't also need to
+  // check overflow.
+  cmp     r4, xh, lsr #20
+  beq     LOCAL_LABEL(add_nocarry)
+
+  // Otherwise, the addition has carried into the exponent. Subtract the
+  // exponent and sign off again, because add_carry (again shared with the
+  // small-shift code) will need those not to be in xh, because it will need to
+  // shift just the mantissa down by a bit.
+  sub     xh, xh, r4, lsl #20
+
+LOCAL_LABEL(add_carry):
+  // We get here from both shift branches if magnitude addition overflowed the
+  // input mantissa, so that the output will have an exponent one larger than
+  // the larger input.
+  //
+  // xh:xl was the larger input mantissa _without_ its leading 1, which we then
+  // added y's mantissa to. So before we shift down, we must put on the
+  // explicit leading 1.
+  add     xh, xh, #1 << 20
+  lsrs    xh, xh, #1
+  rrxs    xl, xl
+  // Now we can put the sign and exponent back on.
+  add     xh, xh, r4, lsl #20
+
+  // The right shift left the round bit in C. So if that's clear, we're not
+  // rounding up; we only have to check for overflow and then we can return.
+  bcc     LOCAL_LABEL(add_check_overflow_pop)
+
+  // Otherwise, set up for the combined add_roundeven_or_roundup_carry code:
+  // round up by incrementing the low word xl, leaving the carry bit set if
+  // xh needs to be incremented too. If that addition _didn't_ carry, make the
+  // round word in r14 that's zero if we need to round to even. Then Z is set
+  // in either case, and otherwise, we only have overflow checking left to do.
+  adds    xl, xl, #1              // set Z if there's a carry
+  lslsne  r14, yl, r14            // else set Z if we need to round to even
+  pop     {r4, r14}
+  bne     LOCAL_LABEL(add_check_overflow)     // if Z not set for either reason, done
+
+LOCAL_LABEL(add_roundeven_or_roundup_carry):
+  // Just as in the add_nocarry case above, here we know that we've just
+  // incremented xl, and we either need to propagate a carry into xh, or we
+  // need to round to even, or both. See the comment there for explanation of
+  // these three instructions.
+  //
+  // The difference in this case is that after we've done that, we also need to
+  // check for overflow, where add_nocarry knew that wasn't necessary.
+  cmp     xl, #0                  // is xl 0?
+  bicne   xl, xl, #1              // if not, then round to even
+  adceq   xh, xh, #0              // if so, then increment xh
+  // We come here with a result ready to be returned, except that we have to
+  // check it for overflow first.
+LOCAL_LABEL(add_check_overflow):
+  lsl     yh, xh, #1              // move exponent into top 11 bits of yh
+  cmp     yh, #0xFFE00000         // if yh >= this, then exponent is all 1s
+  bxlo    lr                      // otherwise, no overflow
+
+  // If we haven't just returned, then we have an overflow. In addition we can
+  // only overflow by up to a factor of 2, so the sign bit in xh is still
+  // correct, and even the exponent has all its bits set. We only need to clear
+  // the mantissa.
+  mov     xl, #0                   // clear low word
+  lsrs    xh, xh, #20
+  lsls    xh, xh, #20
+  bx      lr
+
+  // Alternative entry point to add_check_overflow above, for use when the
+  // registers pushed at the start of the function haven't been popped yet.
+LOCAL_LABEL(add_check_overflow_pop):
+  pop     {r4, r14}
+  b       LOCAL_LABEL(add_check_overflow)
+
+LOCAL_LABEL(add_uncommon):
+  // We come here from the start of the function if we detected that either
+  // input had exponent 0x7ff or 0x000: that is, at least one operand is a NaN,
+  // infinity, denormal or zero.
+  //
+  // First detect whether there are any NaNs or infinities, by checking more
+  // specifically if either input has exponent 0x7ff. We take advantage of
+  // knowing that r14 was set to 0xFFC00000 in the function prologue, so we can
+  // make a useful constant for this test by adjusting it.
+  orr     r14, r14, #0x00200000   // now r14 = 0xFFE00000
+  bics    r4, r14, xh, lsl #1     // if x has exponent 0x7ff, this sets r4=0
+  bicsne  r4, r14, yh, lsl #1     // and similarly for y
+  beq     LOCAL_LABEL(add_naninf)             // so if either set Z, we have a NaN or inf
+
+  // Now we've ruled out NaNs and infinities. With NaNs gone, it's safe to flip
+  // the signs of the inputs (which only mattered for returning the right NaN).
+  // So check if the signs are the same, and cross-jump to sub_zerodenorm
+  // (magnitude subtraction involving a zero or denormal) if not. Meanwhile,
+  // that will cross-jump back to here in the opposite case.
+  teq     xh, yh
+  eormi   yh, yh, #1 << 31
+  bmi     LOCAL_LABEL(sub_zerodenorm)
+LOCAL_LABEL(add_zerodenorm):
+  // Now we know we're doing a magnitude addition, involving at least one zero
+  // or denormal, and no NaNs or infinities.
+  //
+  // Sort the operands into magnitude order so that x >= y, exactly as in the
+  // main code path.
+  subs    r4, xl, yl          // compare inputs, also keeping x-y
+  sbcs    r12, xh, yh
+  bhs     LOCAL_LABEL(add_zerodenorm_swapped) // if x>=y then branch round the swap
+  adds    yl, yl, r4          // otherwise turn y into x by adding (x-y)
+  adc     yh, yh, r12
+  subs    xl, xl, r4          // and turn x into y by subtracting it
+  sbc     xh, xh, r12
+LOCAL_LABEL(add_zerodenorm_swapped):
+  // Set up the output sign+exponent, and the exponent difference, again
+  // exactly as in the main code path.
+  lsr     r4, xh, #20            // r4 = initial sign+exponent of the output
+  sub     r12, r4, yh, lsr #20   // r12 = exponent difference
+
+  // With the operands sorted so that y is smallest, and knowing there's at
+  // least one zero or denormal present, we know furthermore that if there's
+  // a zero at all then it's y. And if y=0, then _whatever_ is in x is the right
+  // answer to return from the whole operation, whether it's another zero, a
+  // denormal, or normalised.
+  orrs    r14, yl, yh, lsl #1     // test all bits of y except the sign bit
+  popeq   {r4, pc}                // if they're all zero, we're done
+
+  // Otherwise, there are no zeroes, so y must be denormal, and we don't yet
+  // know if x is denormal too.
+  //
+  // If x isn't denormal, we rejoin the main code path for adding normalised
+  // numbers, with everything set up as add_doadd expects. It's easiest to
+  // represent the denormal y the same way the FP format does, as a mantissa
+  // without its leading bit set, shifted by the same amount as normalised
+  // numbers of the lowest exponent. (Renormalising via CLZ is more work, and
+  // not needed for addition.)
+  //
+  // To tell the main code that y's mantissa should be shifted by the same
+  // amount as a number with exponent 0x001, we must adjust the exponent
+  // difference r12 by one, because we've already made that by subtracting the
+  // _raw_ exponent values.
+
+  lsls    r14, r4, #21          // output exp = 0? If so, x is denormal too
+  bic     xh, xh, r4, lsl #20   // clear sign+exponent from top of x
+  bicne   yh, yh, #1 << 31      // if x not denormal, clear sign of y
+  subne   r12, r12, #1          //   and adjust exponent difference
+  bne     LOCAL_LABEL(add_doadd)            //   and rejoin the main path
+
+  // If we didn't take that branch, then both operands are denormal. In that
+  // situation we can simply do a 64-bit _integer_ addition of the values we
+  // have already! Both inputs represent numbers less than 2^52, with the same
+  // exponent; so adding them produces a number less than 2^53, which means
+  // it's either still a denormal, or if the addition carried into bit 52 then
+  // it's become a normalised number, with the mantissa still scaled by the
+  // same factor relative to the true value.
+  //
+  // The only tricky part is the sign bit. But we cleared that out of xh above,
+  // and haven't cleared it out of yh, so there's exactly one copy of it
+  // involved in this addition. So the sign bit will end up correct at the top
+  // of xh too.
+  adds    xl, xl, yl
+  adc     xh, xh, yh
+  pop     {r4, pc}
+
+LOCAL_LABEL(add_naninf):
+  // We come here knowing that at least one operand is either NaN or infinity.
+  // If there's a NaN, we can tailcall __compiler_rt_dnan2 to do the right
+  // thing. Pop our stacked registers first: we won't need that much spare
+  // space any more, and it makes the tailcall easier if we've already done it.
+  pop     {r4, r14}
+
+  // A number is a NaN if its exponent is 0x7ff and at least one bit below that
+  // is set. The CMP + ADC pair here converts the two words xh:xl into a single
+  // word containing xh shifted up by one (throwing away the sign bit which
+  // makes no difference), with its low bit set if xl was nonzero. So if that
+  // is strictly greater than 0xffe00000, then x was a NaN.
+  cmp     xl, #1
+  adc     r12, xh, xh
+  cmp     r12, #0xFFE00000
+  bhi     SYMBOL_NAME(__compiler_rt_dnan2)
+  // Now check y in the same way.
+  cmp     yl, #1
+  adc     r12, yh, yh
+  cmp     r12, #0xFFE00000
+  bhi     SYMBOL_NAME(__compiler_rt_dnan2)
+
+LOCAL_LABEL(add_inf):
+  // Now we know there are no NaNs. Therefore there's at least one infinity. If
+  // we have two infinities of opposite sign, that's an invalid operation and
+  // we must return NaN; this happens if and only if x XOR y is all zero except
+  // for the top bit.
+  eor     r12, xh, yh
+  cmp     r12, #0x80000000
+  eorseq  r12, xl, yl
+  beq     LOCAL_LABEL(addsub_return_nan)
+
+  // Otherwise, only one sign of infinity is involved in our addition, so
+  // return whichever operand is the infinity. Since we know there are no NaNs,
+  // we can identify an infinity from just its exponent.
+  lsl     r12, xh, #1
+  cmp     r12, #0xFFE00000
+  bxeq    lr
+  movs    xh, yh
+  movs    xl, yl
+  bx      lr
+
+LOCAL_LABEL(addsub_return_nan):
+  // Return the default NaN, in the case of adding +inf to -inf.
+  movw    xh, 0x7ff8
+  lsls    xh, xh, #16        // 0x7ff80000 is the high word of the default NaN
+  mov     xl, #0             // and the low word is 0
+  bx      lr
+
+END_COMPILERRT_FUNCTION(__aeabi_dadd)
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_drsub)
+  // Reversed subtraction, that is, compute y-x, where x is in r0/r1 and y in
+  // r2/r3.
+  //
+  // We could implement this by simply swapping the register pairs. But the
+  // point of having a reversed-subtract in the first place is to avoid the
+  // caller having to do that, so if we do it ourselves, it wastes all the time
+  // they saved. So instead, on the fast path, we redo the sign check our own
+  // way and branch to add_magnitude or sub_magnitude.
+
+  push    {r4, r14}
+
+  // Start by testing for uncommon operands in the same way as dadd.
+  ldr     r14, =0xFFC00000
+  add     r12, xh, #1 << 20   // r12 has the adjusted version of x's exponent
+  add     r4, yh, #1 << 20    // and r4 the adjusted version of y's
+  tst     r14, r12, lsl #1    // test the top 10 exponent bits of each
+  tstne   r14, r4, lsl #1
+  beq     LOCAL_LABEL(rsub_uncommon)      // and branch out of line if either is 0
+
+  // Check if the signs are equal, and branch to one or the other of
+  // add_magnitude and sub_magnitude.
+  //
+  // If the signs are unequal, then y-x is a magnitude addition: we negate x so
+  // that we're computing y + (-x), in which both values have the same sign and
+  // go to add_magnitude. If the signs are equal then y-x is a magnitude
+  // subtraction, equal to (-x) - (-y), so we negate both operands and go to
+  // sub_magnitude. Since x needs to be negated in both cases, we can do that
+  // unconditionally.
+  teq     xh, yh              // N set for a magnitude addition
+  eor     xh, xh, #1 << 31    // negate x unconditionally
+  bmi     LOCAL_LABEL(add_magnitude)      // branch away for magnitude addition
+  eor     yh, yh, #1 << 31    // otherwise, negate y too
+  b       LOCAL_LABEL(sub_magnitude)      // and do a magnitude subtraction
+
+LOCAL_LABEL(rsub_uncommon):
+  // Any uncommon operands to drsub are handled by just swapping the two
+  // operands and going to dsub's handler. We're off the main fast path now, so
+  // there's no need to try to optimise it any harder.
+  eor     xh, xh, yh
+  eor     xl, xl, yl
+  eor     yh, yh, xh
+  eor     yl, yl, xl
+  eor     xh, xh, yh
+  eor     xl, xl, yl
+  b       LOCAL_LABEL(sub_uncommon)
+
+END_COMPILERRT_FUNCTION(__aeabi_drsub)
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__subdf3)
+  push {r4, lr}                 // keep lr across the call below
+  VMOV_FROM_DOUBLE(r0, r1, d0)  // presumably copies operand x out of d0
+  VMOV_FROM_DOUBLE(r2, r3, d1)  // presumably copies operand y out of d1
+  bl __aeabi_dsub               // subtract via the core-register entry point
+  VMOV_TO_DOUBLE(d0, r0, r1)    // presumably copies the result back to d0
+  pop {r4, pc}                  // restore r4 and return to the caller
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__subdf3, __aeabi_dsub)
+#endif
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_dsub)
+  // Main entry point for subtraction.
+
+  push    {r4, r14}
+
+  // Start by testing for uncommon operands in the same way as dadd.
+  ldr     r14, =0xFFC00000
+  add     r12, xh, #1 << 20   // r12 has the adjusted version of x's exponent
+  add     r4, yh, #1 << 20    // and r4 the adjusted version of y's
+  tst     r14, r12, lsl #1    // test the top 10 exponent bits of each
+  tstne   r14, r4, lsl #1
+  beq     LOCAL_LABEL(sub_uncommon)       // and branch out of line if either is 0
+
+  // Check the signs, and if they're unequal, cross-jump into dadd to do
+  // magnitude addition. (Now we've excluded NaNs, it's safe to flip the sign
+  // of y.)
+  teq     xh, yh
+  eormi   yh, yh, #1 << 31
+  bmi     LOCAL_LABEL(add_magnitude)
+LOCAL_LABEL(sub_magnitude):
+  // If we get here, we're subtracting operands with equal signs (i.e. a
+  // magnitude subtraction). First thing to do is put operands in magnitude
+  // order, so that x >= y. However, if they are swapped, we must also negate
+  // both of them, since A - B = (-B) - (-A). We do this by flipping the top
+  // bit of the value we add/subtract to each input to perform the swap
+  subs    r4, xl, yl          // compare inputs, also keeping x-y
+  sbcs    r12, xh, yh
+  bhs     LOCAL_LABEL(sub_swapped)        // if x>=y then branch round the swap
+  eor     r12, r12, #1 << 31  // flip the top bit of x-y
+  adds    yl, yl, r4          // so that this addition turns y into x+TOPBIT
+  adc     yh, yh, r12
+  subs    xl, xl, r4          // and this subtraction turns x into y-TOPBIT
+  sbc     xh, xh, r12
+LOCAL_LABEL(sub_swapped):
+  // Keep the sign and exponent of the larger input, to use as the sign and
+  // exponent of the output (up to carries and overflows). Also calculate the
+  // exponent difference, which tells us how far we'll need to shift y's
+  // mantissa right to add it to x's.
+  //
+  // As in dadd, the values being subtracted both include the sign bit, but
+  // we've already ensured the sign bits are the same (if we came here from
+  // dadd then we flipped the sign of y), so as in dadd, they cancel.
+  lsr     r4, xh, #20
+  sub     r12, r4, yh, lsr #20
+
+  // Isolate the two mantissas.
+  bic     xh, xh, r4, lsl #20
+  bic     yh, yh, r14, asr #2     // 0xffc00000 ASR 2 = 0xfff00000
+
+  // Negate the mantissa of y, so that we can compute the difference using
+  // ADD/ADC. As a side effect we also add in the leading bit of y's mantissa,
+  // by subtracting y from 0xfff0000000000000 instead of from 0.
+  rsbs    yl, yl, #0
+#if !__thumb__
+  rsc     yh, yh, r14, asr #2     // 0xffc00000 ASR 2 = 0xfff00000
+#else
+  // Thumb has no RSC, so simulate it by bitwise inversion and then ADC
+  mvn     yh, yh
+  adc     yh, yh, r14, asr #2     // 0xffc00000 ASR 2 = 0xfff00000
+#endif
+
+LOCAL_LABEL(sub_dosub):
+  // Here we perform the actual subtraction. We either fell through from the
+  // code above, or jumped back to here after handling an input denormal.
+  //
+  // We get here with:
+  //   Operands known to be numeric rather than zero/infinity/NaN;
+  //   xh:xl = mantissa of larger operand, with low bit at the bottom of xl
+  //   yh:yl = negated mantissa of smaller operand, similarly
+  //   r4 = result sign and exponent (in low 12 bits);
+  //   r12 = exponent difference.
+  //
+  // For normal inputs, the value in yh:yl will be as if the mantissa of y had
+  // the leading bit set before negating it. For denormal y, the mantissa will
+  // have been negated without setting that bit, similarly to dadd.
+
+  // As in dadd, we start by separating off the case where we're shifting the
+  // mantissa of y right by more than 32 bits.
+  rsbs    r14, r12, #32
+  blo     LOCAL_LABEL(sub_bigshift)
+
+  // The exponent difference is 32 or less. The test above also left
+  // (32-difference) in r14, which is the amount we need to shift yh left by to
+  // recover the bits that the right shift will lose off the bottom.
+#if !__thumb__
+  // Add the right-shifted parts of yh and yl to xh and xl, keeping the carry
+  // in between if any.
+  adds    xl, xl, yl, lsr r12
+  adc     xh, xh, yh, asr r12
+  // Now add the remainder of yh to the low word, again checking for a carry.
+  adds    xl, xl, yh, lsl r14
+  adcs    xh, xh, #0
+#else
+  // The Thumb version of the addition, which must do each register-controlled
+  // shift in a separate instruction from the addition. This works the same as
+  // the dadd version, except that we use ASR to shift yh right, because yh:yl
+  // contains a negative signed integer.
+
+  // As in dadd, start by adding (yh shifted left), so as not to waste the
+  // value we've already set up in r14.
+  lsl     r14, yh, r14
+  adds    xl, xl, r14
+  adcs    xh, xh, #0
+  // Then add (yh shifted right) and (yl shifted right).
+  lsr     r14, yl, r12
+  adds    xl, xl, r14
+  asr     r14, yh, r12
+  adcs    xh, xh, r14
+  // And now reconstruct the inverted shift count, for use later.
+  rsb     r14, r12, #32
+#endif
+
+  // We know we had x >= y before the subtraction. So x-y is still a number of
+  // the same sign, but its exponent might have reduced. If we'd set the
+  // leading bit on x's mantissa before subtracting, we'd be able to tell this
+  // by testing if it was still set. But in fact we didn't, so the question is
+  // whether x's mantissa without the leading bit is still even positive.
+  //
+  // The last ADCS (in either of the Arm and Thumb code sequences above) will
+  // have set the N flag if x < 0, which is the case where the exponent has
+  // reduced. Branch out of line for that case.
+  bmi     LOCAL_LABEL(sub_borrow)
+
+LOCAL_LABEL(sub_noborrow):
+  // This is the easy case: the exponent of x has stayed the same, so there's
+  // no possibility of underflow. All we have to do is put the pieces of the
+  // result back together, round, and return.
+
+  // Recombine x's mantissa with the output sign and exponent.
+  add     xh, xh, r4, lsl #20
+
+  // Make the word of bits shifted off the bottom of y's mantissa, with the
+  // topmost bit indicating whether we round up or down, and the rest used to
+  // determine whether to round to even.
+  lsls    yl, yl, r14
+
+  // If the top bit of the round word is clear, then we're rounding down, so
+  // the value in xh:xl is already correct and we can return.
+  poppl   {r4, pc}
+
+  // Otherwise, start by rounding up. As in dadd, we make the Z flag do double
+  // duty: it's initially set by the ADDS to indicate a carry into the high
+  // word, and then if that doesn't happen then we have another chance to set
+  // it if the round word indicates an exact halfway case. So we can return
+  // early in the common case where neither of those things happened.
+  adds    xl, xl, #1
+  cmpne   yl, #0x80000000
+  popne   {r4, pc}
+
+  // Now if xl=0 then we must increment xh (the addition from rounding carried
+  // into the high word). Otherwise we must round to even, by clearing the low
+  // bit of xl. As in dadd, it's possible that _both_ conditions are true at
+  // once, but in that situation, the fact that xl=0 means it makes no
+  // difference whether we clear its low bit or not.
+  cmp     xl, #0              // do we need to increment xh?
+  addeq   xh, xh, #1          // if so, do it
+  bicne   xl, xl, #1          // otherwise, round to even
+  pop     {r4, pc}
+
+LOCAL_LABEL(sub_bigshift):
+  // We come here from dsub_dosub if y's mantissa must be shifted right by more
+  // than 32 bits.
+  //
+  // In dadd_bigshift we concluded that all of yl could be condensed into a
+  // single bit at the bottom of the round word, because it could only affect
+  // round-to-even. However, in subtraction, that's not true, because we might
+  // renormalise: if the input exponents differ by exactly 33, and the
+  // subtraction reduces the exponent by 1, then the top bit of yl might become
+  // the round bit again. So we must make our round word by shifting two extra
+  // bits on to the bottom of yh: first the topmost bit of yl, then a single
+  // bit indicating whether any of the rest is nonzero.
+  //
+  // As in dadd_bigshift, we make this new round word in yl, leaving yh
+  // unmodified so that we can use it for the actual shift-and-add.
+  //
+  // (For these purposes, we only have to worry about renormalisation by _one_
+  // bit. If the output exponent reduces by 2 or more, it must be because the
+  // input exponents were so close that the output is exact anyway, so a round
+  // word isn't needed at all.)
+  adds    r14, yl, yl         // put the top bit of yl into C
+  adc     yl, yh, yh          // and shift it in to the bottom of yh
+  cmp     r14, #1             // set C if anything below that bit was nonzero
+  adc     yl, yl, yl          // shift that in to yl as well
+
+  // Calculate shift counts. r12 is how far to shift yh right when adding; r14
+  // is how far to shift yl left to make the round word (subtracted from 30
+  // instead of 32 to account for the two bits we just shifted in at the bottom
+  // of yl).
+  //
+  // If the latter shift count goes negative, then we can't use it. Branch to
+  // another handler for _really_ big exponent differences.
+  sub     r12, r12, #32
+  rsbs    r14, r12, #30
+  blo     LOCAL_LABEL(sub_hugeshift)
+
+  // Shift yh right and add it to x, to produce the unrounded output mantissa.
+#if !__thumb__
+  adds    xl, xl, yh, asr r12
+#else
+  // In Thumb we must do the register-controlled shift and addition separately
+  asr     r12, yh, r12
+  adds    xl, xl, r12
+#endif
+  // The top half of the addition, propagating a carry from xl into xh. Since
+  // yh was a negative number and we arithmetically shifted it right, the value
+  // we add to xh is 0xFFFFFFFF rather than 0, as if we'd sign-extended that
+  // negative number to 64 bits.
+  adcs    xh, xh, #-1
+
+  // As in the small-shift case above, if this has left a positive value in
+  // xh:xl, it means the exponent hasn't changed, so we can go to the easy
+  // epilogue code in dsub_noborrow.
+  bpl     LOCAL_LABEL(sub_noborrow)
+
+LOCAL_LABEL(sub_borrow):
+  // We come here from either of the small-shift or large-shift versions of the
+  // subtraction step, if the subtraction caused xh:xl to go negative. This
+  // means that the result of the subtraction is less than the smallest
+  // possible value with x's exponent. In other words, the output will have a
+  // smaller exponent, and we must shift the mantissa left and put some bits
+  // back in from yl (which contains the bits of y shifted off the bottom).
+  //
+  // The most important question in this situation is: do we have to shift the
+  // mantissa left by only one bit, or by more than one? It's important because
+  // in the case where we shift left by more than one bit, no rounding can
+  // possibly be needed: if x >= 2^k but x-y < 2^{k-1}, then y > 2^{k-1}, so
+  // the exponents of x and y differ by at most 1. Therefore the lowest set bit
+  // in the true difference x-y (before rounding) can't possibly be any lower
+  // than the bit just off the bottom of x's mantissa, and we're shifting left
+  // by at least 1, so that will be part of the output mantissa. So in this
+  // case the result must be exact.
+  //
+  // (This is not normally considered a good thing from the point of view of
+  // the user! Subtracting two very close values and getting a result that has
+  // a lot of mantissa bits zero at the bottom is called 'significance loss'
+  // and can be a cause of numerical instability. But whether the client code
+  // _likes_ it or not, the IEEE standard is very clear that we must return the
+  // value with lots of trailing 0 bits, which can't need any rounding.)
+  //
+  // On the other hand, if we shift left by only one bit, then the value we
+  // subtracted from x could have been almost arbitrarily small, so there's
+  // lots of scope for bits of y to have been shifted off the bottom to cause
+  // rounding.
+  //
+  // Conclusion: we either shift left 1 and have to figure out rounding, or we
+  // shift left more than 1 and have to figure out the right shift count, but
+  // never both.
+
+  // On entry to here, (yl << r14) gives the bits shifted off the bottom of
+  // xh:xl. Shift xh:xl up by one, bringing the high bit of that back in.
+  //
+  // If we're shifting left by only one bit, then the mantissa is now at its
+  // correct position and yl is the round word. On the other hand, if we're
+  // shifting by more, then all the output mantissa bits we need are now in
+  // xh:xl, and there aren't any in yl that still need to be salvaged.
+  add     r14, r14, #1            // we want to shift yl one extra bit left
+  lsls    r14, yl, r14            // do the shift, leaving the top bit in C
+  adcs    xl, xl, xl              // shift that in to the bottom of xl
+  adc     xh, xh, xh              // and propagate into xh
+
+  // Our next task is to find out which case we're in: shift by one bit and
+  // round, or figure out how many more bits to shift by? We can determine this
+  // by looking at bit 20 of xh: if that's 0 then we need to shift further.
+  //
+  // But to save instructions, we fold that test together with a test for
+  // another awkward case: was the input exponent in r4 equal to 1? If so, then
+  // it's been decremented to 0, which means the result of the subtraction is a
+  // denormal. (Separately from that, we might _also_ get a denormal if
+  // significance loss has occurred, even if the exponent in r4 was larger.)
+  //
+  // To do both of these tests at once, we add the original output exponent in
+  // r4 back in to xh, _shifted left by an extra bit_, as if we'd added it
+  // before doing the shift above. This loses the sign bit off the top, and
+  // since the top 11 bits of xh are all 1, has the same result as decrementing
+  // r4. So bit 20 of xh is unaffected (it's still 0 if we need to shift
+  // further), and bits 21 and upwards are all zero if the output might be
+  // denormal.
+  //
+  // The Arm condition code LS (unsigned lower-or-same) is implemented by
+  // testing if C=0 or Z=1. That's just what we need! Having made our modified
+  // version of xh, shift it right so that bit 20 goes off the bottom into the
+  // carry flag. Then C=0 means bit 20 of xh was clear and we need to shift
+  // further; Z=1 means the exponent has decremented from 1 to 0 and we're
+  // returning a denormal; if _either_ is true, then the BLS will send us out
+  // of line.
+
+  add     r12, xh, r4, lsl #21    // make test value (keeping the original xh)
+  lsrs    r12, r12, #21           // set C and Z to the values we want to test
+  bls     LOCAL_LABEL(sub_renorm_or_denorm)   // branch out of line if C=0 or Z=1
+
+  // If we haven't taken that branch, then we now have our mantissa in the
+  // correct position _and_ we're confident that the output is a normalised
+  // number. So we only have rounding left to do.
+  //
+  // Put the sign and exponent back on the output. Because the bits in xh's
+  // exponent field are still all 1s, this decrements the exponent in r4 by
+  // one, which is just what we want.
+  add     xh, xh, r4, lsl #20
+
+  // The round bit is at the top of r14, so we can add it to the bottom of xl
+  // by a right shift.
+  //
+  // If this addition carries off the top of xl, then C and Z will both be set.
+  // If C is not set, then Z might still be set because xl was already zero.
+  adds    xl, xl, r14, lsr #31
+  // We only need to check for round-to-even if there wasn't a carry, because
+  // if there was a carry, xl = 0 and so clearing its low bit won't make a
+  // difference anyway. So in the C=0 case, we now clobber the potentially
+  // misleading value left in Z by the previous instruction, and replace it
+  // with the result of checking r14 against the exact halfway value of the
+  // round word.
+  cmpcc   r14, #0x80000000
+  // Now if Z is clear, we don't have to round to even _or_ propagate a carry
+  // into xh, so we're done.
+  popne   {r4, pc}
+
+  // Otherwise, we have to either round to even, or increment xh. We increment
+  // xh exactly if xl = 0, because the case where xl=0 without rounding up
+  // would have taken the early return: the ADDS would have left C clear, so
+  // the CMPCC would have checked r14 against 0x80000000, and would have
+  // compared unequal because the top bit of r14 would have been clear.
+  cmp     xl, #0                  // is xl zero?
+  addeq   xh, xh, #1              // if so, increment xh to propagate carry
+  bicne   xl, xl, #1              // otherwise, clear xl bit 0 to round to even
+  pop     {r4, pc}
+
+LOCAL_LABEL(sub_renorm_or_denorm):
+  // We come here from the tricky combined test above, where we set C=0 if the
+  // output mantissa still doesn't have its leading bit set, and Z=1 if the
+  // exponent has already decreased to 0 so that the output will be denormal.
+  //
+  // In the latter case, we don't want to shift the mantissa any further up,
+  // because we'd only have to shift it back down again. So branch again to
+  // deal with that, or fall through to multiple-bit renormalisation.
+  beq     LOCAL_LABEL(sub_already_denormal)
+
+  // We'll want to adjust the exponent by the amount we shift. So split up the
+  // sign and exponent, so that we can do arithmetic on the exponent without
+  // the sign getting in the way.
+  lsr     r12, r4, #11            // sign is now in r12 bit 0
+  bic     r4, r4, #1 << 11        // exponent is in r4 all by itself
+
+  // Add the leading bit of x's mantissa back in (at bit 21 rather than 20
+  // because we already shifted left by one), to recover the full output
+  // mantissa.
+  //
+  // As a side effect, this sets Z to indicate that the top word xh is all
+  // zero, so now we know which of xh and xl we need to CLZ. It's easier to
+  // separate the two cases than to try to deal with them in a combined code
+  // path. We branch out of line for the xh=0 case, on the theory that the
+  // larger the renormalization, the less likely it is, so the common case
+  // stays in line.
+  adds    xh, xh, #1 << 21
+  beq     LOCAL_LABEL(sub_renorm_clz_xl)
+
+  // There's a set bit somewhere in xh. Find it, and shift it up to bit 20.
+  clz     yl, xh                  // distance from leading bit to bit 31
+  subs    yl, yl, #11             // distance to bit 20, where we want it
+  rsbs    yh, yl, #32             // work out the associated right shift
+  lsls    xh, xh, yl              // shift xh upwards
+#if !__thumb__
+  orr     xh, xh, xl, lsr yh      // combine with the high bits of xl
+#else
+  // As usual, in Thumb we must do the register-controlled right shift and the
+  // ORR separately.
+  lsrs    yh, xl, yh
+  orrs    xh, xh, yh
+#endif
+  lsls    xl, xl, yl              // finally, shift xl left
+
+  // Adjust the exponent downward, to match the distance we just shifted the
+  // mantissa upward.
+  //
+  // We adjust downward by an extra 2: one because we already shifted xh left
+  // by one bit, and another because the leading bit of the renormalized
+  // mantissa will increment it again.
+  subs    r4, r4, yl
+  subs    r4, r4, #2
+
+LOCAL_LABEL(sub_renormed):
+  // Here the two renormalization branches reconverge. The output mantissa in
+  // xh:xl has been shifted up to the correct position, with its leading bit
+  // present and in bit 20 of xh. r4 is the adjusted exponent, and the low bit
+  // of r12 is the output sign.
+  //
+  // Recombine all the pieces. Since no rounding is needed on this path, the
+  // output is correct and ready to return unless the exponent is too small.
+  // The smallest valid exponent is 0, because it will be adjusted upwards by 1
+  // by the leading mantissa bit. Since the last thing both branches did before
+  // coming here was to update r4 using a flag-setting instruction, we can
+  // therefore detect underflow by the N flag.
+  add     xh, xh, r12, lsl #31
+  add     xh, xh, r4, lsl #20
+  poppl   {r4, pc}
+
+  // Renormalisation made the exponent negative. We're well off the fast path
+  // by now, so the simplest way to sort this out is to use the helper routine
+  // __dunder.
+  add     xh, xh, #3 << 29        // rebias exponent as __dunder will expect
+  mov     r2, #0                  // rounding direction = 0 for an exact answer
+  pop     {r4, lr}
+  b       SYMBOL_NAME(__compiler_rt_dunder)
+
+LOCAL_LABEL(sub_renorm_clz_xl):
+  // This is the alternative renormalization code for the case where xh=0, so
+  // that the highest remaining set bit in the mantissa is somewhere in xl.
+  // Again we want to shift that all the way up to bit 20 of xh. The easiest
+  // way is to shift it to the top of xl, and then shift that in turn by a
+  // fixed distance to split it across xh[20..0] and xl[31..21], saving a
+  // conditional decision about whether to shift up or down.
+  //
+  // However, there's another special case: on this branch, we might find out
+  // that we've subtracted two _exactly_ equal values, not just nearly equal,
+  // so the result is zero! To handle this quickly, we put the shifted-up
+  // version of xl into xh instead of shifting it in place. Then, if it's zero,
+  // we've just filled xh _and_ xl with zero bits, so we can return
+  // immediately. (Since this function always uses round-to-nearest mode, an
+  // output zero from subtracting like-signed inputs is unconditionally +0.)
+  clz     yh, xl
+  lsls    xh, xl, yh              // xh = xl shifted so leading bit is bit 31
+  popeq   {r4, pc}                // and if the answer is 0, just return it
+  lsls    xl, xh, #21             // now set xl to the low bits of the mantissa
+  lsrs    xh, xh, #11             // and xh to the high bits
+
+  // Adjust the exponent down by the amount we shifted up, which is the CLZ
+  // output (in yh), plus another 21 bits to get from the top bit of xl to bit
+  // 20 of xh, plus 1 bit for the shift already performed before we did the
+  // CLZ, plus 1 which the leading mantissa bit will undo when we add it to the
+  // exponent. Then go back to dsub_renormed for the shared epilogue code.
+  subs    r4, r4, yh
+  subs    r4, r4, #23
+  b       LOCAL_LABEL(sub_renormed)
+
+LOCAL_LABEL(sub_hugeshift):
+  // We came here in the case where the whole of y's mantissa was shifted down
+  // so far that dsub_bigshift couldn't cope with it. In this situation the
+  // result of the subtraction differs from the input x by under half a ULP, so
+  // we just return the original x, which we recover by putting the sign and
+  // exponent in r4 back together with the mantissa.
+  add     xh, xh, r4, lsl #20
+  pop     {r4, pc}
+
+LOCAL_LABEL(sub_already_denormal):
+  // We come here if the initial renormalization by one bit reduced the
+  // exponent of x from 1 to 0, so that the output is denormal. In this
+  // situation we don't need to call __dunder to figure out how far to shift
+  // the result, because the answer is a constant: the mantissa was already in
+  // the right place _before_ our one-bit left shift (denormals have the same
+  // mantissa shift as normalised numbers with the smallest exponent), so all
+  // we have to do is undo that left shift, and put the sign bit back on.
+  movs    xh, xh, asr #1
+  rrx     xl, xl
+  add     xh, xh, r4, lsl #20
+
+LOCAL_LABEL(sub_check_zero):
+  // Here we have a denormal result in xh:xl, with its sign bit already in
+  // place ... except that the mantissa might be all zeroes, in which case we
+  // must clear the sign bit so as to return +0.
+  pop     {r4, r14}
+  orrs    r12, xl, xh, lsl #1     // EQ if all non-sign bits of x are zero
+  bxne    lr                      // if that's not true, return our denormal
+  movs    xh, #0                  // otherwise, clear xh completely
+  bx      lr
+
+LOCAL_LABEL(sub_uncommon):
+  // We come here from the start of the function if we detected that either
+  // input had exponent 0x7ff or 0x000: that is, at least one operand is a NaN,
+  // infinity, denormal or zero.
+  //
+  // First detect whether there are any NaNs or infinities, by checking more
+  // specifically if either input has exponent 0x7ff. We take advantage of
+  // knowing that r14 was set to 0xFFC00000 in the function prologue, so we can
+  // make a useful constant for this test by adjusting it.
+  orr     r14, r14, #0x00200000   // now r14 = 0xFFE00000
+  bics    r4, r14, xh, lsl #1     // if x has exponent 0x7ff, this sets r4=0
+  bicsne  r4, r14, yh, lsl #1     // and similarly for y
+  beq     LOCAL_LABEL(sub_naninf)             // so if either set Z, we have a NaN or inf
+
+  // Now we've ruled out NaNs and infinities. With NaNs gone, it's safe to flip
+  // the signs of the inputs (which only mattered for returning the right NaN).
+  // So check if the signs are the same, and cross-jump to dadd_zerodenorm
+  // (magnitude addition involving a zero or denormal) if not. Meanwhile,
+  // that will cross-jump back to here in the opposite case.
+  teq     xh, yh
+  eormi   yh, yh, #1 << 31
+  bmi     LOCAL_LABEL(add_zerodenorm)
+LOCAL_LABEL(sub_zerodenorm):
+  // Now we know we're doing a magnitude subtraction, involving at least one
+  // zero or denormal, and no NaNs or infinities.
+  //
+  // Sort the operands into magnitude order so that x >= y, exactly as in the
+  // main code path, including the EOR that negates both operands in the course
+  // of swapping them.
+  subs    r4, xl, yl          // compare inputs, also keeping x-y
+  sbcs    r12, xh, yh
+  bhs     LOCAL_LABEL(sub_zerodenorm_swapped) // if x>=y then branch round the swap
+  eor     r12, r12, #1 << 31  // flip the top bit of x-y
+  adds    yl, yl, r4          // so that this addition turns y into x+TOPBIT
+  adc     yh, yh, r12
+  subs    xl, xl, r4          // and this subtraction turns x into y-TOPBIT
+  sbc     xh, xh, r12
+LOCAL_LABEL(sub_zerodenorm_swapped):
+  // Set up the output sign+exponent, and the exponent difference, again
+  // exactly as in the main code path.
+  lsr     r4, xh, #20
+  sub     r12, r4, yh, lsr #20
+
+  // With the operands sorted so that y is smallest, and knowing there's at
+  // least one zero or denormal present, we know furthermore that if there's
+  // a zero at all then it's y. And if y=0, then we just return x, except that
+  // if x=0 too we must fix up the sign of zero.
+  orrs    r14, yl, yh, lsl #1     // test all bits of y except the sign bit
+  beq     LOCAL_LABEL(sub_check_zero)         // if they're all zero, return x
+
+  // Otherwise, there are no zeroes, so y must be denormal, and we don't yet
+  // know if x is denormal too.
+  //
+  // If x isn't denormal, we rejoin the main code path for subtracting
+  // normalised numbers, with everything set up as dsub_dosub expects. It's
+  // easiest to represent the denormal y the same way the FP format does, as a
+  // mantissa without its leading bit set, shifted by the same amount as
+  // normalised numbers of the lowest exponent. (Renormalising via CLZ is more
+  // work, and not needed here.)
+  //
+  // To tell the main code that y's mantissa should be shifted by the same
+  // amount as a number with exponent 0x001, we must adjust the exponent
+  // difference r12 by one, because we've already made that by subtracting the
+  // _raw_ exponent values.
+  lsls    r14, r4, #21          // output exp = 0? If so, x is denormal too
+  bic     xh, xh, r4, lsl #20   // clear sign+exponent from top of x
+  beq     LOCAL_LABEL(sub_both_denorm)      // if both inputs denormal, go elsewhere
+  bic     yh, yh, #1 << 31      // if x not denormal, clear sign of y
+  sub     r12, r12, #1          //   and adjust exponent difference
+  // Now negate the mantissa of y and then rejoin the main path.
+  rsbs    yl, yl, #0
+#if !__thumb__
+  rsc     yh, yh, #0
+#else
+  // Thumb has no RSC, so simulate it by bitwise inversion and then ADC
+  mvn     yh, yh
+  adc     yh, yh, #0
+#endif
+  b       LOCAL_LABEL(sub_dosub)
+
+LOCAL_LABEL(sub_both_denorm):
+  // If both inputs are denormal, then we can just subtract the mantissas like
+  // ordinary integers. We've cleared the sign bit from x, but not from y, so
+  // we'll get exactly one copy of the sign bit in the result. (Negating it
+  // makes no difference!)
+  subs    xl, xl, yl
+  sbc     xh, xh, yh
+  // Now go to dsub_check_zero, which will check if the answer is exactly zero,
+  // and fix the sign bit if it is.
+  b       LOCAL_LABEL(sub_check_zero)
+
+  // Handle NaNs and infinities in subtraction.
+LOCAL_LABEL(sub_naninf):
+  // Look for NaNs and hand them off to __dnan2, exactly as in dadd_naninf.
+  pop     {r4, r14}
+  cmp     xl, #1
+  adc     r12, xh, xh
+  cmp     r12, #0xFFE00000
+  bhi     SYMBOL_NAME(__compiler_rt_dnan2)
+  cmp     yl, #1
+  adc     r12, yh, yh
+  cmp     r12, #0xFFE00000
+  bhi     SYMBOL_NAME(__compiler_rt_dnan2)
+
+  // Now we know there aren't any NaNs, we can deal with subtractions involving
+  // an infinity by flipping the sign of y and letting dadd_inf deal with it.
+  eor     yh, yh, #0x80000000
+  b       LOCAL_LABEL(add_inf)
+
+END_COMPILERRT_FUNCTION(__aeabi_dsub)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/test/builtins/Unit/adddf3new_test.c b/compiler-rt/test/builtins/Unit/adddf3new_test.c
new file mode 100644
index 000000000000..c4913144d33d
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/adddf3new_test.c
@@ -0,0 +1,684 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_adddf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultD to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7ff8000000000000. For the Arm optimized FP implementation, which commits
+// to a more detailed handling of NaNs, we tighten up the check and include
+// some extra test cases specific to that NaN policy.
+#if COMPILER_RT_ARM_OPTIMIZED_FP
+#  define EXPECT_EXACT_RESULTS
+#  define ARM_NAN_HANDLING
+#endif
+
+// Returns: a + b
+COMPILER_RT_ABI double __adddf3(double a, double b);
+
+int test__adddf3(uint64_t a_rep, uint64_t b_rep, uint64_t expected_rep,
+                 int line) {
+  // Add the two bit patterns as doubles; a nonzero return means a mismatch.
+  const double sum = __adddf3(fromRep64(a_rep), fromRep64(b_rep));
+#ifdef EXPECT_EXACT_RESULTS
+  const int failed = toRep64(sum) != expected_rep;
+#else
+  const int failed = compareResultD(sum, expected_rep);
+#endif
+
+  if (failed) {
+    printf("error at line %d: __adddf3(%016" PRIx64 ", %016" PRIx64
+           ") = %016" PRIx64 ", expected %016" PRIx64 "\n",
+           line, a_rep, b_rep, toRep64(sum), expected_rep);
+  }
+  return failed;
+}
+
+#define test__adddf3(a, b, x) (test__adddf3)(a, b, x, __LINE__)
+
+int main(void) {
+  int status = 0;
+
+  status |=
+      test__adddf3(0x0000000000000000, 0x0000000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x0000000000000000, 0x000fffffffffffff, 0x000fffffffffffff);
+  status |=
+      test__adddf3(0x0000000000000000, 0x3ff0000000000000, 0x3ff0000000000000);
+  status |=
+      test__adddf3(0x0000000000000000, 0x7fe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x0000000000000000, 0x8000000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x0000000000000000, 0x800fffffffffffff, 0x800fffffffffffff);
+  status |=
+      test__adddf3(0x0000000000000000, 0x8010000000000000, 0x8010000000000000);
+  status |=
+      test__adddf3(0x0000000000000000, 0xfff0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0x0000000000000001, 0x0000000000000001, 0x0000000000000002);
+  status |=
+      test__adddf3(0x0000000000000001, 0x3fefffffffffffff, 0x3fefffffffffffff);
+  status |=
+      test__adddf3(0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000);
+  status |=
+      test__adddf3(0x0000000000000001, 0x3ffffffffffffffe, 0x3ffffffffffffffe);
+  status |=
+      test__adddf3(0x0000000000000001, 0x3fffffffffffffff, 0x3fffffffffffffff);
+  status |=
+      test__adddf3(0x0000000000000001, 0x7fdfffffffffffff, 0x7fdfffffffffffff);
+  status |=
+      test__adddf3(0x0000000000000001, 0x7fe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x0000000000000001, 0x7feffffffffffffe, 0x7feffffffffffffe);
+  status |=
+      test__adddf3(0x0000000000000001, 0x7fefffffffffffff, 0x7fefffffffffffff);
+  status |=
+      test__adddf3(0x0000000000000001, 0x8000000000000001, 0x0000000000000000);
+  status |=
+      test__adddf3(0x0000000000000002, 0x8000000000000001, 0x0000000000000001);
+  status |=
+      test__adddf3(0x0000000000000003, 0x0000000000000000, 0x0000000000000003);
+  status |=
+      test__adddf3(0x0000000000000003, 0x7ff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x0000000000000003, 0x8000000000000000, 0x0000000000000003);
+  status |=
+      test__adddf3(0x0000000000000003, 0x8000000000000002, 0x0000000000000001);
+  status |=
+      test__adddf3(0x0000000000000003, 0xc014000000000000, 0xc014000000000000);
+  status |=
+      test__adddf3(0x0000000000000003, 0xffe0000000000000, 0xffe0000000000000);
+  status |=
+      test__adddf3(0x0000000000000003, 0xfff0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0x0000000000000004, 0x0000000000000004, 0x0000000000000008);
+  status |=
+      test__adddf3(0x000ffffffffffffc, 0x800ffffffffffffc, 0x0000000000000000);
+  status |=
+      test__adddf3(0x000ffffffffffffd, 0x800ffffffffffffe, 0x8000000000000001);
+  status |=
+      test__adddf3(0x000fffffffffffff, 0x000fffffffffffff, 0x001ffffffffffffe);
+  status |=
+      test__adddf3(0x000fffffffffffff, 0x800ffffffffffffe, 0x0000000000000001);
+  status |=
+      test__adddf3(0x000fffffffffffff, 0x8010000000000000, 0x8000000000000001);
+  status |=
+      test__adddf3(0x0010000000000000, 0x0000000000000000, 0x0010000000000000);
+  status |=
+      test__adddf3(0x0010000000000000, 0x0010000000000000, 0x0020000000000000);
+  status |=
+      test__adddf3(0x0010000000000000, 0x8010000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x0010000000000001, 0x8010000000000000, 0x0000000000000001);
+  status |=
+      test__adddf3(0x0010000000000001, 0x8010000000000002, 0x8000000000000001);
+  status |=
+      test__adddf3(0x001fffffffffffff, 0x8020000000000000, 0x8000000000000001);
+  status |=
+      test__adddf3(0x001fffffffffffff, 0x8020000000000002, 0x8000000000000005);
+  status |=
+      test__adddf3(0x001fffffffffffff, 0x8020000000000004, 0x8000000000000009);
+  status |=
+      test__adddf3(0x0020000000000000, 0x801fffffffffffff, 0x0000000000000001);
+  status |=
+      test__adddf3(0x0020000000000001, 0x8010000000000001, 0x0010000000000001);
+  status |=
+      test__adddf3(0x0020000000000001, 0x801fffffffffffff, 0x0000000000000003);
+  status |=
+      test__adddf3(0x0020000000000002, 0x8010000000000001, 0x0010000000000003);
+  status |=
+      test__adddf3(0x002fffffffffffff, 0x8030000000000000, 0x8000000000000002);
+  status |=
+      test__adddf3(0x0030000000000000, 0x802fffffffffffff, 0x0000000000000002);
+  status |=
+      test__adddf3(0x0030000000000001, 0x802fffffffffffff, 0x0000000000000006);
+  status |=
+      test__adddf3(0x0030000000000002, 0x8020000000000003, 0x0020000000000001);
+  status |=
+      test__adddf3(0x3fefffffffffffff, 0x8000000000000001, 0x3fefffffffffffff);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x3ff0000000000003, 0x4000000000000002);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x4000000000000000, 0x4008000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x401c000000000000, 0x4020000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x8000000000000000, 0x3ff0000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0xbff0000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000001, 0x3ff0000000000000, 0x4000000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000001, 0xbff0000000000000, 0x3cb0000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000001, 0xbff0000000000002, 0xbcb0000000000000);
+  status |=
+      test__adddf3(0x3ffffffffffffffc, 0xbffffffffffffffd, 0xbcb0000000000000);
+  status |=
+      test__adddf3(0x3fffffffffffffff, 0xc000000000000000, 0xbcb0000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0x3cb0000000000000, 0x4000000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0x3ff0000000000000, 0x4008000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0x4000000000000000, 0x4010000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0x4000000000000001, 0x4010000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0xbfffffffffffffff, 0x3cb0000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0xc000000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0xc000000000000001, 0xbcc0000000000000);
+  status |=
+      test__adddf3(0x4000000000000000, 0xc014000000000000, 0xc008000000000000);
+  status |=
+      test__adddf3(0x4000000000000001, 0x3cb0000000000000, 0x4000000000000002);
+  status |=
+      test__adddf3(0x4000000000000001, 0x4000000000000002, 0x4010000000000002);
+  status |=
+      test__adddf3(0x4000000000000001, 0xbff0000000000001, 0x3ff0000000000001);
+  status |=
+      test__adddf3(0x4000000000000002, 0xbff0000000000001, 0x3ff0000000000003);
+  status |=
+      test__adddf3(0x4000000000000002, 0xbff0000000000003, 0x3ff0000000000001);
+  status |=
+      test__adddf3(0x4000000000000004, 0xc000000000000003, 0x3cc0000000000000);
+  status |=
+      test__adddf3(0x4008000000000000, 0x4008000000000000, 0x4018000000000000);
+  status |=
+      test__adddf3(0x400fffffffffffff, 0x3cafffffffffffff, 0x400fffffffffffff);
+  status |=
+      test__adddf3(0x400fffffffffffff, 0x3cb0000000000000, 0x4010000000000000);
+  status |=
+      test__adddf3(0x400fffffffffffff, 0xc00ffffffffffffe, 0x3cc0000000000000);
+  status |=
+      test__adddf3(0x400fffffffffffff, 0xc010000000000002, 0xbce4000000000000);
+  status |=
+      test__adddf3(0x4010000000000001, 0xc00fffffffffffff, 0x3cd8000000000000);
+  status |=
+      test__adddf3(0x4014000000000000, 0x0000000000000000, 0x4014000000000000);
+  status |=
+      test__adddf3(0x4014000000000000, 0x8000000000000000, 0x4014000000000000);
+  status |=
+      test__adddf3(0x4014000000000000, 0xbff0000000000000, 0x4010000000000000);
+  status |=
+      test__adddf3(0x4014000000000000, 0xc014000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x7fb0000000000001, 0xffafffffffffffff, 0x7c78000000000000);
+  status |=
+      test__adddf3(0x7fcfffffffffffff, 0xffcffffffffffffe, 0x7c80000000000000);
+  status |=
+      test__adddf3(0x7fcfffffffffffff, 0xffd0000000000002, 0xfca4000000000000);
+  status |=
+      test__adddf3(0x7fd0000000000000, 0x7fd0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x7fd0000000000000, 0xffcfffffffffffff, 0x7c80000000000000);
+  status |=
+      test__adddf3(0x7fd0000000000000, 0xffd0000000000001, 0xfc90000000000000);
+  status |=
+      test__adddf3(0x7fd0000000000001, 0x7fd0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x7fd0000000000001, 0xffe0000000000001, 0xffd0000000000001);
+  status |=
+      test__adddf3(0x7fd0000000000002, 0xffc0000000000003, 0x7fc0000000000001);
+  status |=
+      test__adddf3(0x7fd0000000000004, 0xffd0000000000003, 0x7c90000000000000);
+  status |=
+      test__adddf3(0x7fdffffffffffffe, 0x7fdffffffffffffe, 0x7feffffffffffffe);
+  status |=
+      test__adddf3(0x7fdffffffffffffe, 0x7fdfffffffffffff, 0x7feffffffffffffe);
+  status |=
+      test__adddf3(0x7fdfffffffffffff, 0x3ff0000000000000, 0x7fdfffffffffffff);
+  status |=
+      test__adddf3(0x7fdfffffffffffff, 0x7fe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7fdfffffffffffff, 0xbff0000000000000, 0x7fdfffffffffffff);
+  status |=
+      test__adddf3(0x7fdfffffffffffff, 0xffe0000000000000, 0xfc90000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000000, 0x3ff0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000000, 0x7fe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000000, 0x7ff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000000, 0xbff0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000000, 0xffe0000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000000, 0xfff0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000001, 0x7fe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000001, 0xffe0000000000000, 0x7ca0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000001, 0xffe0000000000002, 0xfca0000000000000);
+  status |=
+      test__adddf3(0x7fe0000000000002, 0xffd0000000000001, 0x7fd0000000000003);
+  status |=
+      test__adddf3(0x7feffffffffffffe, 0x3ff0000000000000, 0x7feffffffffffffe);
+  status |=
+      test__adddf3(0x7feffffffffffffe, 0x7feffffffffffffe, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7feffffffffffffe, 0x7fefffffffffffff, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7feffffffffffffe, 0xbff0000000000000, 0x7feffffffffffffe);
+  status |=
+      test__adddf3(0x7feffffffffffffe, 0xffefffffffffffff, 0xfca0000000000000);
+  status |=
+      test__adddf3(0x7fefffffffffffff, 0x3ff0000000000000, 0x7fefffffffffffff);
+  status |=
+      test__adddf3(0x7fefffffffffffff, 0x8000000000000001, 0x7fefffffffffffff);
+  status |=
+      test__adddf3(0x7fefffffffffffff, 0xbff0000000000000, 0x7fefffffffffffff);
+  status |=
+      test__adddf3(0x7fefffffffffffff, 0xffefffffffffffff, 0x0000000000000000);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x000fffffffffffff, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x7fe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x800fffffffffffff, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0xffe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x8000000000000000, 0x0000000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x8000000000000000, 0x000fffffffffffff, 0x000fffffffffffff);
+  status |=
+      test__adddf3(0x8000000000000000, 0x7fe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
+  status |=
+      test__adddf3(0x8000000000000000, 0x800fffffffffffff, 0x800fffffffffffff);
+  status |=
+      test__adddf3(0x8000000000000000, 0x8010000000000000, 0x8010000000000000);
+  status |=
+      test__adddf3(0x8000000000000000, 0xbff0000000000000, 0xbff0000000000000);
+  status |=
+      test__adddf3(0x8000000000000000, 0xfff0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0x8000000000000001, 0x0000000000000001, 0x0000000000000000);
+  status |=
+      test__adddf3(0x8000000000000001, 0x8000000000000001, 0x8000000000000002);
+  status |=
+      test__adddf3(0x8000000000000001, 0xbfefffffffffffff, 0xbfefffffffffffff);
+  status |=
+      test__adddf3(0x8000000000000001, 0xbff0000000000000, 0xbff0000000000000);
+  status |=
+      test__adddf3(0x8000000000000001, 0xbffffffffffffffe, 0xbffffffffffffffe);
+  status |=
+      test__adddf3(0x8000000000000001, 0xbfffffffffffffff, 0xbfffffffffffffff);
+  status |=
+      test__adddf3(0x8000000000000001, 0xffdfffffffffffff, 0xffdfffffffffffff);
+  status |=
+      test__adddf3(0x8000000000000001, 0xffe0000000000000, 0xffe0000000000000);
+  status |=
+      test__adddf3(0x8000000000000001, 0xffeffffffffffffe, 0xffeffffffffffffe);
+  status |=
+      test__adddf3(0x8000000000000001, 0xffefffffffffffff, 0xffefffffffffffff);
+  status |=
+      test__adddf3(0x8000000000000002, 0x0000000000000001, 0x8000000000000001);
+  status |=
+      test__adddf3(0x8000000000000003, 0x0000000000000000, 0x8000000000000003);
+  status |=
+      test__adddf3(0x8000000000000003, 0x0000000000000002, 0x8000000000000001);
+  status |=
+      test__adddf3(0x8000000000000003, 0x4008000000000000, 0x4008000000000000);
+  status |=
+      test__adddf3(0x8000000000000003, 0x7fe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__adddf3(0x8000000000000003, 0x7ff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0x8000000000000003, 0x8000000000000000, 0x8000000000000003);
+  status |=
+      test__adddf3(0x8000000000000003, 0xfff0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0x8000000000000004, 0x8000000000000004, 0x8000000000000008);
+  status |=
+      test__adddf3(0x800ffffffffffffd, 0x000ffffffffffffe, 0x0000000000000001);
+  status |=
+      test__adddf3(0x800fffffffffffff, 0x000ffffffffffffe, 0x8000000000000001);
+  status |=
+      test__adddf3(0x800fffffffffffff, 0x000fffffffffffff, 0x0000000000000000);
+  status |=
+      test__adddf3(0x800fffffffffffff, 0x0010000000000000, 0x0000000000000001);
+  status |=
+      test__adddf3(0x800fffffffffffff, 0x800fffffffffffff, 0x801ffffffffffffe);
+  status |=
+      test__adddf3(0x8010000000000000, 0x0000000000000000, 0x8010000000000000);
+  status |=
+      test__adddf3(0x8010000000000000, 0x0010000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0x8010000000000001, 0x0010000000000000, 0x8000000000000001);
+  status |=
+      test__adddf3(0x8010000000000001, 0x0010000000000002, 0x0000000000000001);
+  status |=
+      test__adddf3(0x801fffffffffffff, 0x0020000000000000, 0x0000000000000001);
+  status |=
+      test__adddf3(0x801fffffffffffff, 0x0020000000000002, 0x0000000000000005);
+  status |=
+      test__adddf3(0x801fffffffffffff, 0x0020000000000004, 0x0000000000000009);
+  status |=
+      test__adddf3(0x8020000000000000, 0x001fffffffffffff, 0x8000000000000001);
+  status |=
+      test__adddf3(0x8020000000000001, 0x0010000000000001, 0x8010000000000001);
+  status |=
+      test__adddf3(0x8020000000000001, 0x001fffffffffffff, 0x8000000000000003);
+  status |=
+      test__adddf3(0x8020000000000002, 0x0010000000000001, 0x8010000000000003);
+  status |=
+      test__adddf3(0x802fffffffffffff, 0x0030000000000000, 0x0000000000000002);
+  status |=
+      test__adddf3(0x8030000000000000, 0x002fffffffffffff, 0x8000000000000002);
+  status |=
+      test__adddf3(0x8030000000000001, 0x002fffffffffffff, 0x8000000000000006);
+  status |=
+      test__adddf3(0x8030000000000002, 0x0020000000000003, 0x8020000000000001);
+  status |=
+      test__adddf3(0xbff0000000000000, 0x8000000000000000, 0xbff0000000000000);
+  status |=
+      test__adddf3(0xbff0000000000000, 0xbff0000000000003, 0xc000000000000002);
+  status |=
+      test__adddf3(0xbff0000000000001, 0x3ff0000000000000, 0xbcb0000000000000);
+  status |=
+      test__adddf3(0xbff0000000000001, 0x3ff0000000000002, 0x3cb0000000000000);
+  status |=
+      test__adddf3(0xbff0000000000001, 0xbff0000000000000, 0xc000000000000000);
+  status |=
+      test__adddf3(0xbffffffffffffffc, 0x3ffffffffffffffd, 0x3cb0000000000000);
+  status |=
+      test__adddf3(0xbfffffffffffffff, 0x0000000000000001, 0xbfffffffffffffff);
+  status |=
+      test__adddf3(0xbfffffffffffffff, 0x4000000000000000, 0x3cb0000000000000);
+  status |=
+      test__adddf3(0xc000000000000000, 0x3fffffffffffffff, 0xbcb0000000000000);
+  status |=
+      test__adddf3(0xc000000000000000, 0x4000000000000001, 0x3cc0000000000000);
+  status |=
+      test__adddf3(0xc000000000000000, 0xc000000000000001, 0xc010000000000000);
+  status |=
+      test__adddf3(0xc000000000000001, 0x3ff0000000000001, 0xbff0000000000001);
+  status |=
+      test__adddf3(0xc000000000000001, 0xc000000000000002, 0xc010000000000002);
+  status |=
+      test__adddf3(0xc000000000000002, 0x3ff0000000000001, 0xbff0000000000003);
+  status |=
+      test__adddf3(0xc000000000000002, 0x3ff0000000000003, 0xbff0000000000001);
+  status |=
+      test__adddf3(0xc000000000000004, 0x4000000000000003, 0xbcc0000000000000);
+  status |=
+      test__adddf3(0xc008000000000000, 0x4008000000000000, 0x0000000000000000);
+  status |=
+      test__adddf3(0xc00fffffffffffff, 0x400ffffffffffffe, 0xbcc0000000000000);
+  status |=
+      test__adddf3(0xc00fffffffffffff, 0x4010000000000002, 0x3ce4000000000000);
+  status |=
+      test__adddf3(0xc00fffffffffffff, 0xbcafffffffffffff, 0xc00fffffffffffff);
+  status |=
+      test__adddf3(0xc00fffffffffffff, 0xbcb0000000000000, 0xc010000000000000);
+  status |=
+      test__adddf3(0xc010000000000001, 0x400fffffffffffff, 0xbcd8000000000000);
+  status |=
+      test__adddf3(0xffb0000000000001, 0x7fafffffffffffff, 0xfc78000000000000);
+  status |=
+      test__adddf3(0xffcfffffffffffff, 0x7fcffffffffffffe, 0xfc80000000000000);
+  status |=
+      test__adddf3(0xffcfffffffffffff, 0x7fd0000000000002, 0x7ca4000000000000);
+  status |=
+      test__adddf3(0xffd0000000000000, 0x7fcfffffffffffff, 0xfc80000000000000);
+  status |=
+      test__adddf3(0xffd0000000000000, 0x7fd0000000000001, 0x7c90000000000000);
+  status |=
+      test__adddf3(0xffd0000000000001, 0x7fe0000000000001, 0x7fd0000000000001);
+  status |=
+      test__adddf3(0xffd0000000000001, 0xffd0000000000000, 0xffe0000000000000);
+  status |=
+      test__adddf3(0xffd0000000000002, 0x7fc0000000000003, 0xffc0000000000001);
+  status |=
+      test__adddf3(0xffd0000000000004, 0x7fd0000000000003, 0xfc90000000000000);
+  status |=
+      test__adddf3(0xffdffffffffffffe, 0x7fdffffffffffffe, 0x0000000000000000);
+  status |=
+      test__adddf3(0xffdffffffffffffe, 0xffdffffffffffffe, 0xffeffffffffffffe);
+  status |=
+      test__adddf3(0xffdffffffffffffe, 0xffdfffffffffffff, 0xffeffffffffffffe);
+  status |=
+      test__adddf3(0xffdfffffffffffff, 0x3ff0000000000000, 0xffdfffffffffffff);
+  status |=
+      test__adddf3(0xffdfffffffffffff, 0x7fe0000000000000, 0x7c90000000000000);
+  status |=
+      test__adddf3(0xffdfffffffffffff, 0xbff0000000000000, 0xffdfffffffffffff);
+  status |=
+      test__adddf3(0xffdfffffffffffff, 0xffe0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000000, 0x0000000000000000, 0xffe0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000000, 0x3ff0000000000000, 0xffe0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000000, 0x7ff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000000, 0x8000000000000000, 0xffe0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000000, 0xbff0000000000000, 0xffe0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000000, 0xffe0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000000, 0xfff0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000001, 0x7fe0000000000000, 0xfca0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000001, 0x7fe0000000000002, 0x7ca0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000001, 0xffe0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xffe0000000000002, 0x7fd0000000000001, 0xffd0000000000003);
+  status |=
+      test__adddf3(0xffeffffffffffffe, 0x3ff0000000000000, 0xffeffffffffffffe);
+  status |=
+      test__adddf3(0xffeffffffffffffe, 0x7fefffffffffffff, 0x7ca0000000000000);
+  status |=
+      test__adddf3(0xffeffffffffffffe, 0xbff0000000000000, 0xffeffffffffffffe);
+  status |=
+      test__adddf3(0xffeffffffffffffe, 0xffeffffffffffffe, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xffeffffffffffffe, 0xffefffffffffffff, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xffefffffffffffff, 0x0000000000000001, 0xffefffffffffffff);
+  status |=
+      test__adddf3(0xffefffffffffffff, 0x3ff0000000000000, 0xffefffffffffffff);
+  status |=
+      test__adddf3(0xffefffffffffffff, 0xbff0000000000000, 0xffefffffffffffff);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x0000000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x000fffffffffffff, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x7fe0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x8000000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x800fffffffffffff, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xfff0000000000000, 0xffe0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000);
+  status |=
+      test__adddf3(0x3de3a83a83a83a83, 0xbff0000000000000, 0xbfefffffffec57c5);
+  status |=
+      test__adddf3(0x0000000007ffffff, 0x0010000000010000, 0x001000000800ffff);
+  status |=
+      test__adddf3(0x001effffffffffff, 0x0000000000400000, 0x001f0000003fffff);
+  status |=
+      test__adddf3(0x80000000000003ff, 0x801ffffbffffffff, 0x801ffffc000003fe);
+  status |=
+      test__adddf3(0x80003fffffffffff, 0x8010000000100000, 0x80104000000fffff);
+
+  // Test that the result of an operation is a NaN at all when it should be.
+  //
+  // In most configurations these tests' results are checked using
+  // compareResultD, so we set all the answers to the canonical NaN
+  // 0x7ff8000000000000, which causes compareResultD to accept any NaN
+  // encoding. We also use the same value as the input NaN in tests that have
+  // one, so that even in EXPECT_EXACT_RESULTS mode these tests should pass,
+  // because 0x7ff8000000000000 is still the exact expected NaN.
+  status |=
+      test__adddf3(0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x7ff8000000000000, 0x7ff8000000000000);
+  status |=
+      test__adddf3(0x7ff8000000000000, 0x3ff0000000000000, 0x7ff8000000000000);
+  status |=
+      test__adddf3(0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000);
+
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by
+  // arm/adddf3.S:
+  //
+  //  - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  //  - if a signalling NaN appears in the input, the output quiet NaN is
+  //    obtained by setting its top mantissa bit and leaving everything else
+  //    unchanged
+  //
+  //  - if both operands are signalling NaNs then the output NaN is derived
+  //    from the first operand
+  //
+  //  - if both operands are quiet NaNs then the output NaN is the first
+  //    operand
+  //
+  //  - invalid operations not involving an input NaN return the quiet
+  //    NaN with fewest bits set, 0x7ff8000000000000.
+  status |=
+      test__adddf3(0x0000000000000000, 0x7ff3758244400801, 0x7ffb758244400801);
+  status |=
+      test__adddf3(0x0000000000000000, 0x7fff44d3f65148af, 0x7fff44d3f65148af);
+  status |=
+      test__adddf3(0x0000000000000001, 0x7ff48607b4b37057, 0x7ffc8607b4b37057);
+  status |=
+      test__adddf3(0x0000000000000001, 0x7ff855f2d435b33d, 0x7ff855f2d435b33d);
+  status |=
+      test__adddf3(0x000fffffffffffff, 0x7ff169269a674e13, 0x7ff969269a674e13);
+  status |=
+      test__adddf3(0x000fffffffffffff, 0x7ffc80978b2ef0da, 0x7ffc80978b2ef0da);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x7ff3458ad034593d, 0x7ffb458ad034593d);
+  status |=
+      test__adddf3(0x3ff0000000000000, 0x7ffdd8bb98c9f13a, 0x7ffdd8bb98c9f13a);
+  status |=
+      test__adddf3(0x7fefffffffffffff, 0x7ff79a8b96250a98, 0x7fff9a8b96250a98);
+  status |=
+      test__adddf3(0x7fefffffffffffff, 0x7ffdcc675b63bb94, 0x7ffdcc675b63bb94);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x7ff018cfaf4d0fff, 0x7ff818cfaf4d0fff);
+  status |=
+      test__adddf3(0x7ff0000000000000, 0x7ff83ad1ab4dfd24, 0x7ff83ad1ab4dfd24);
+  status |=
+      test__adddf3(0x7ff48ce6c0cdd5ac, 0x0000000000000000, 0x7ffc8ce6c0cdd5ac);
+  status |=
+      test__adddf3(0x7ff08a34f3d5385b, 0x0000000000000001, 0x7ff88a34f3d5385b);
+  status |=
+      test__adddf3(0x7ff0a264c1c96281, 0x000fffffffffffff, 0x7ff8a264c1c96281);
+  status |=
+      test__adddf3(0x7ff77ce629e61f0e, 0x3ff0000000000000, 0x7fff7ce629e61f0e);
+  status |=
+      test__adddf3(0x7ff715e2d147fd76, 0x7fefffffffffffff, 0x7fff15e2d147fd76);
+  status |=
+      test__adddf3(0x7ff689a2031f1781, 0x7ff0000000000000, 0x7ffe89a2031f1781);
+  status |=
+      test__adddf3(0x7ff5dfb4a0c8cd05, 0x7ff11c1fe9793a33, 0x7ffddfb4a0c8cd05);
+  status |=
+      test__adddf3(0x7ff5826283ffb5d7, 0x7fff609b83884e81, 0x7ffd826283ffb5d7);
+  status |=
+      test__adddf3(0x7ff7cb03f2e61d42, 0x8000000000000000, 0x7fffcb03f2e61d42);
+  status |=
+      test__adddf3(0x7ff2adc8dfe72c96, 0x8000000000000001, 0x7ffaadc8dfe72c96);
+  status |=
+      test__adddf3(0x7ff4fc0bacc707f2, 0x800fffffffffffff, 0x7ffcfc0bacc707f2);
+  status |=
+      test__adddf3(0x7ff76248c8c9a619, 0xbff0000000000000, 0x7fff6248c8c9a619);
+  status |=
+      test__adddf3(0x7ff367972fce131b, 0xffefffffffffffff, 0x7ffb67972fce131b);
+  status |=
+      test__adddf3(0x7ff188f5ac284e92, 0xfff0000000000000, 0x7ff988f5ac284e92);
+  status |=
+      test__adddf3(0x7ffed4c22e4e569d, 0x0000000000000000, 0x7ffed4c22e4e569d);
+  status |=
+      test__adddf3(0x7ffe95105fa3f339, 0x0000000000000001, 0x7ffe95105fa3f339);
+  status |=
+      test__adddf3(0x7ffb8d33dbb9ecfb, 0x000fffffffffffff, 0x7ffb8d33dbb9ecfb);
+  status |=
+      test__adddf3(0x7ff874e41dc63e07, 0x3ff0000000000000, 0x7ff874e41dc63e07);
+  status |=
+      test__adddf3(0x7ffe27594515ecdf, 0x7fefffffffffffff, 0x7ffe27594515ecdf);
+  status |=
+      test__adddf3(0x7ffeac86d5c69bdf, 0x7ff0000000000000, 0x7ffeac86d5c69bdf);
+  status |=
+      test__adddf3(0x7ff97d657b99f76f, 0x7ff7e4149862a796, 0x7fffe4149862a796);
+  status |=
+      test__adddf3(0x7ffad17c6aa33fad, 0x7ffd898893ad4d28, 0x7ffad17c6aa33fad);
+  status |=
+      test__adddf3(0x7ff96e04e9c3d173, 0x8000000000000000, 0x7ff96e04e9c3d173);
+  status |=
+      test__adddf3(0x7ffec01ad8da3abb, 0x8000000000000001, 0x7ffec01ad8da3abb);
+  status |=
+      test__adddf3(0x7ffd1d565c495941, 0x800fffffffffffff, 0x7ffd1d565c495941);
+  status |=
+      test__adddf3(0x7ffe3d24f1e474a7, 0xbff0000000000000, 0x7ffe3d24f1e474a7);
+  status |=
+      test__adddf3(0x7ffc206f2bb8c8ce, 0xffefffffffffffff, 0x7ffc206f2bb8c8ce);
+  status |=
+      test__adddf3(0x7ff93efdecfb7d3b, 0xfff0000000000000, 0x7ff93efdecfb7d3b);
+  status |=
+      test__adddf3(0x8000000000000000, 0x7ff2ee725d143ac5, 0x7ffaee725d143ac5);
+  status |=
+      test__adddf3(0x8000000000000000, 0x7ffbba26e5c5fe98, 0x7ffbba26e5c5fe98);
+  status |=
+      test__adddf3(0x8000000000000001, 0x7ff7818a1cd26df9, 0x7fff818a1cd26df9);
+  status |=
+      test__adddf3(0x8000000000000001, 0x7ffaee6cc63b5292, 0x7ffaee6cc63b5292);
+  status |=
+      test__adddf3(0x800fffffffffffff, 0x7ff401096edaf79d, 0x7ffc01096edaf79d);
+  status |=
+      test__adddf3(0x800fffffffffffff, 0x7ffbf1778c7a2e59, 0x7ffbf1778c7a2e59);
+  status |=
+      test__adddf3(0xbff0000000000000, 0x7ff2e8fb0201c496, 0x7ffae8fb0201c496);
+  status |=
+      test__adddf3(0xbff0000000000000, 0x7ffcb6a5adb2e154, 0x7ffcb6a5adb2e154);
+  status |=
+      test__adddf3(0xffefffffffffffff, 0x7ff1ea1bfc15d71d, 0x7ff9ea1bfc15d71d);
+  status |=
+      test__adddf3(0xffefffffffffffff, 0x7ffae0766e21efc0, 0x7ffae0766e21efc0);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x7ff3b364cffbdfe6, 0x7ffbb364cffbdfe6);
+  status |=
+      test__adddf3(0xfff0000000000000, 0x7ffd0d3223334ae3, 0x7ffd0d3223334ae3);
+
+#endif // ARM_NAN_HANDLING
+
+  return status;
+}
diff --git a/compiler-rt/test/builtins/Unit/subdf3new_test.c b/compiler-rt/test/builtins/Unit/subdf3new_test.c
new file mode 100644
index 000000000000..dd72fbb8b8f5
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/subdf3new_test.c
@@ -0,0 +1,706 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_subdf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultD to check the returned doubles,
+// which accepts any returned NaN if the expected result is the canonical NaN
+// value 0x7ff8000000000000. For the Arm optimized FP implementation, which
+// commits to a more detailed handling of NaNs, we tighten up the check and
+// include some extra test cases specific to that NaN policy.
+#if COMPILER_RT_ARM_OPTIMIZED_FP
+#  define EXPECT_EXACT_RESULTS
+#  define ARM_NAN_HANDLING
+#endif
+
+// Returns: a - b
+COMPILER_RT_ABI double __subdf3(double a, double b);
+
+// Runs one __subdf3 case on raw bit patterns. Prints a message and returns
+// nonzero on failure; `line` comes from the same-named macro defined below.
+int test__subdf3(uint64_t a_rep, uint64_t b_rep, uint64_t expected_rep,
+                 int line) {
+  const double diff = __subdf3(fromRep64(a_rep), fromRep64(b_rep));
+#ifndef EXPECT_EXACT_RESULTS
+  const int failed = compareResultD(diff, expected_rep);
+#else
+  const int failed = toRep64(diff) != expected_rep;
+#endif
+  if (failed) {
+    printf("error at line %d: __subdf3(%016" PRIx64 ", %016" PRIx64
+           ") = %016" PRIx64 ", expected %016" PRIx64 "\n",
+           line, a_rep, b_rep, toRep64(diff), expected_rep);
+  }
+  return failed;
+}
+
+#define test__subdf3(a, b, x) test__subdf3(a, b, x, __LINE__)
+
+int main(void) {
+  int status = 0;
+
+  status |=
+      test__subdf3(0x0000000000000000, 0x0000000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x0000000000000000, 0x000fffffffffffff, 0x800fffffffffffff);
+  status |=
+      test__subdf3(0x0000000000000000, 0x0010000000000000, 0x8010000000000000);
+  status |=
+      test__subdf3(0x0000000000000000, 0x7ff0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0x0000000000000000, 0x8000000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x0000000000000000, 0x800fffffffffffff, 0x000fffffffffffff);
+  status |=
+      test__subdf3(0x0000000000000000, 0xbff0000000000000, 0x3ff0000000000000);
+  status |=
+      test__subdf3(0x0000000000000000, 0xffe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x0000000000000000, 0xfff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x0000000000000001, 0x0000000000000001, 0x0000000000000000);
+  status |=
+      test__subdf3(0x0000000000000001, 0x8000000000000001, 0x0000000000000002);
+  status |=
+      test__subdf3(0x0000000000000001, 0xbfefffffffffffff, 0x3fefffffffffffff);
+  status |=
+      test__subdf3(0x0000000000000001, 0xbff0000000000000, 0x3ff0000000000000);
+  status |=
+      test__subdf3(0x0000000000000001, 0xbffffffffffffffe, 0x3ffffffffffffffe);
+  status |=
+      test__subdf3(0x0000000000000001, 0xbfffffffffffffff, 0x3fffffffffffffff);
+  status |=
+      test__subdf3(0x0000000000000001, 0xffdfffffffffffff, 0x7fdfffffffffffff);
+  status |=
+      test__subdf3(0x0000000000000001, 0xffe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x0000000000000001, 0xffeffffffffffffe, 0x7feffffffffffffe);
+  status |=
+      test__subdf3(0x0000000000000001, 0xffefffffffffffff, 0x7fefffffffffffff);
+  status |=
+      test__subdf3(0x0000000000000002, 0x0000000000000001, 0x0000000000000001);
+  status |=
+      test__subdf3(0x0000000000000003, 0x0000000000000000, 0x0000000000000003);
+  status |=
+      test__subdf3(0x0000000000000003, 0x0000000000000002, 0x0000000000000001);
+  status |=
+      test__subdf3(0x0000000000000003, 0x4014000000000000, 0xc014000000000000);
+  status |=
+      test__subdf3(0x0000000000000003, 0x7fe0000000000000, 0xffe0000000000000);
+  status |=
+      test__subdf3(0x0000000000000003, 0x7ff0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0x0000000000000003, 0x8000000000000000, 0x0000000000000003);
+  status |=
+      test__subdf3(0x0000000000000003, 0xfff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x0000000000000004, 0x8000000000000004, 0x0000000000000008);
+  status |=
+      test__subdf3(0x000ffffffffffffc, 0x000ffffffffffffc, 0x0000000000000000);
+  status |=
+      test__subdf3(0x000ffffffffffffd, 0x000ffffffffffffe, 0x8000000000000001);
+  status |=
+      test__subdf3(0x000fffffffffffff, 0x000ffffffffffffe, 0x0000000000000001);
+  status |=
+      test__subdf3(0x000fffffffffffff, 0x0010000000000000, 0x8000000000000001);
+  status |=
+      test__subdf3(0x000fffffffffffff, 0x800fffffffffffff, 0x001ffffffffffffe);
+  status |=
+      test__subdf3(0x0010000000000000, 0x0010000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x0010000000000000, 0x8000000000000000, 0x0010000000000000);
+  status |=
+      test__subdf3(0x0010000000000000, 0x8010000000000000, 0x0020000000000000);
+  status |=
+      test__subdf3(0x0010000000000001, 0x0010000000000000, 0x0000000000000001);
+  status |=
+      test__subdf3(0x0010000000000001, 0x0010000000000002, 0x8000000000000001);
+  status |=
+      test__subdf3(0x001fffffffffffff, 0x0020000000000000, 0x8000000000000001);
+  status |=
+      test__subdf3(0x001fffffffffffff, 0x0020000000000002, 0x8000000000000005);
+  status |=
+      test__subdf3(0x001fffffffffffff, 0x0020000000000004, 0x8000000000000009);
+  status |=
+      test__subdf3(0x0020000000000000, 0x001fffffffffffff, 0x0000000000000001);
+  status |=
+      test__subdf3(0x0020000000000001, 0x0010000000000001, 0x0010000000000001);
+  status |=
+      test__subdf3(0x0020000000000001, 0x001fffffffffffff, 0x0000000000000003);
+  status |=
+      test__subdf3(0x0020000000000002, 0x0010000000000001, 0x0010000000000003);
+  status |=
+      test__subdf3(0x002fffffffffffff, 0x0030000000000000, 0x8000000000000002);
+  status |=
+      test__subdf3(0x0030000000000000, 0x002fffffffffffff, 0x0000000000000002);
+  status |=
+      test__subdf3(0x0030000000000001, 0x002fffffffffffff, 0x0000000000000006);
+  status |=
+      test__subdf3(0x0030000000000002, 0x0020000000000003, 0x0020000000000001);
+  status |=
+      test__subdf3(0x3fefffffffffffff, 0x0000000000000001, 0x3fefffffffffffff);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0x0000000000000000, 0x3ff0000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0x3ff0000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0xbff0000000000000, 0x4000000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0xbff0000000000003, 0x4000000000000002);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0xc000000000000000, 0x4008000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0xc01c000000000000, 0x4020000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000001, 0x3ff0000000000000, 0x3cb0000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000001, 0x3ff0000000000002, 0xbcb0000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000001, 0xbff0000000000000, 0x4000000000000000);
+  status |=
+      test__subdf3(0x3ffffffffffffffc, 0x3ffffffffffffffd, 0xbcb0000000000000);
+  status |=
+      test__subdf3(0x3fffffffffffffff, 0x4000000000000000, 0xbcb0000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0x3fffffffffffffff, 0x3cb0000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0x4000000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0x4000000000000001, 0xbcc0000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0x4014000000000000, 0xc008000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0xbcb0000000000000, 0x4000000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0xbff0000000000000, 0x4008000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0xc000000000000000, 0x4010000000000000);
+  status |=
+      test__subdf3(0x4000000000000000, 0xc000000000000001, 0x4010000000000000);
+  status |=
+      test__subdf3(0x4000000000000001, 0x3ff0000000000001, 0x3ff0000000000001);
+  status |=
+      test__subdf3(0x4000000000000001, 0xbcb0000000000000, 0x4000000000000002);
+  status |=
+      test__subdf3(0x4000000000000001, 0xc000000000000002, 0x4010000000000002);
+  status |=
+      test__subdf3(0x4000000000000002, 0x3ff0000000000001, 0x3ff0000000000003);
+  status |=
+      test__subdf3(0x4000000000000002, 0x3ff0000000000003, 0x3ff0000000000001);
+  status |=
+      test__subdf3(0x4000000000000004, 0x4000000000000003, 0x3cc0000000000000);
+  status |=
+      test__subdf3(0x4008000000000000, 0xc008000000000000, 0x4018000000000000);
+  status |=
+      test__subdf3(0x400fffffffffffff, 0x400ffffffffffffe, 0x3cc0000000000000);
+  status |=
+      test__subdf3(0x400fffffffffffff, 0x4010000000000002, 0xbce4000000000000);
+  status |=
+      test__subdf3(0x400fffffffffffff, 0xbcafffffffffffff, 0x400fffffffffffff);
+  status |=
+      test__subdf3(0x400fffffffffffff, 0xbcb0000000000000, 0x4010000000000000);
+  status |=
+      test__subdf3(0x4010000000000001, 0x400fffffffffffff, 0x3cd8000000000000);
+  status |=
+      test__subdf3(0x4014000000000000, 0x0000000000000000, 0x4014000000000000);
+  status |=
+      test__subdf3(0x4014000000000000, 0x3ff0000000000000, 0x4010000000000000);
+  status |=
+      test__subdf3(0x4014000000000000, 0x4014000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x4014000000000000, 0x8000000000000000, 0x4014000000000000);
+  status |=
+      test__subdf3(0x4280000000000001, 0x3ff0017fffffffff, 0x427ffffffffff001);
+  status |=
+      test__subdf3(0x7fb0000000000001, 0x7fafffffffffffff, 0x7c78000000000000);
+  status |=
+      test__subdf3(0x7fcfffffffffffff, 0x7fcffffffffffffe, 0x7c80000000000000);
+  status |=
+      test__subdf3(0x7fcfffffffffffff, 0x7fd0000000000002, 0xfca4000000000000);
+  status |=
+      test__subdf3(0x7fd0000000000000, 0x7fcfffffffffffff, 0x7c80000000000000);
+  status |=
+      test__subdf3(0x7fd0000000000000, 0x7fd0000000000001, 0xfc90000000000000);
+  status |=
+      test__subdf3(0x7fd0000000000000, 0xffd0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x7fd0000000000001, 0x7fe0000000000001, 0xffd0000000000001);
+  status |=
+      test__subdf3(0x7fd0000000000001, 0xffd0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x7fd0000000000002, 0x7fc0000000000003, 0x7fc0000000000001);
+  status |=
+      test__subdf3(0x7fd0000000000004, 0x7fd0000000000003, 0x7c90000000000000);
+  status |=
+      test__subdf3(0x7fdffffffffffffe, 0xffdffffffffffffe, 0x7feffffffffffffe);
+  status |=
+      test__subdf3(0x7fdffffffffffffe, 0xffdfffffffffffff, 0x7feffffffffffffe);
+  status |=
+      test__subdf3(0x7fdfffffffffffff, 0x3ff0000000000000, 0x7fdfffffffffffff);
+  status |=
+      test__subdf3(0x7fdfffffffffffff, 0x7fe0000000000000, 0xfc90000000000000);
+  status |=
+      test__subdf3(0x7fdfffffffffffff, 0xbff0000000000000, 0x7fdfffffffffffff);
+  status |=
+      test__subdf3(0x7fdfffffffffffff, 0xffe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000000, 0x3ff0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000000, 0x7fe0000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000000, 0x7ff0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000000, 0xbff0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000000, 0xffe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000000, 0xfff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000001, 0x7fe0000000000000, 0x7ca0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000001, 0x7fe0000000000002, 0xfca0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000001, 0xffe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7fe0000000000002, 0x7fd0000000000001, 0x7fd0000000000003);
+  status |=
+      test__subdf3(0x7feffffffffffffe, 0x3ff0000000000000, 0x7feffffffffffffe);
+  status |=
+      test__subdf3(0x7feffffffffffffe, 0x7fefffffffffffff, 0xfca0000000000000);
+  status |=
+      test__subdf3(0x7feffffffffffffe, 0xbff0000000000000, 0x7feffffffffffffe);
+  status |=
+      test__subdf3(0x7feffffffffffffe, 0xffeffffffffffffe, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7feffffffffffffe, 0xffefffffffffffff, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7fefffffffffffff, 0x0000000000000001, 0x7fefffffffffffff);
+  status |=
+      test__subdf3(0x7fefffffffffffff, 0x3ff0000000000000, 0x7fefffffffffffff);
+  status |=
+      test__subdf3(0x7fefffffffffffff, 0x7fefffffffffffff, 0x0000000000000000);
+  status |=
+      test__subdf3(0x7fefffffffffffff, 0xbff0000000000000, 0x7fefffffffffffff);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x000fffffffffffff, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x7fe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x800fffffffffffff, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0xffe0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0xfff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x8000000000000000, 0x0000000000000000, 0x8000000000000000);
+  status |=
+      test__subdf3(0x8000000000000000, 0x000fffffffffffff, 0x800fffffffffffff);
+  status |=
+      test__subdf3(0x8000000000000000, 0x0010000000000000, 0x8010000000000000);
+  status |=
+      test__subdf3(0x8000000000000000, 0x3ff0000000000000, 0xbff0000000000000);
+  status |=
+      test__subdf3(0x8000000000000000, 0x7ff0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0x8000000000000000, 0x8000000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x8000000000000000, 0x800fffffffffffff, 0x000fffffffffffff);
+  status |=
+      test__subdf3(0x8000000000000000, 0xffe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x8000000000000000, 0xfff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x8000000000000001, 0x0000000000000001, 0x8000000000000002);
+  status |=
+      test__subdf3(0x8000000000000001, 0x3fefffffffffffff, 0xbfefffffffffffff);
+  status |=
+      test__subdf3(0x8000000000000001, 0x3ff0000000000000, 0xbff0000000000000);
+  status |=
+      test__subdf3(0x8000000000000001, 0x3ffffffffffffffe, 0xbffffffffffffffe);
+  status |=
+      test__subdf3(0x8000000000000001, 0x3fffffffffffffff, 0xbfffffffffffffff);
+  status |=
+      test__subdf3(0x8000000000000001, 0x7fdfffffffffffff, 0xffdfffffffffffff);
+  status |=
+      test__subdf3(0x8000000000000001, 0x7fe0000000000000, 0xffe0000000000000);
+  status |=
+      test__subdf3(0x8000000000000001, 0x7feffffffffffffe, 0xffeffffffffffffe);
+  status |=
+      test__subdf3(0x8000000000000001, 0x7fefffffffffffff, 0xffefffffffffffff);
+  status |=
+      test__subdf3(0x8000000000000001, 0x8000000000000001, 0x0000000000000000);
+  status |=
+      test__subdf3(0x8000000000000002, 0x8000000000000001, 0x8000000000000001);
+  status |=
+      test__subdf3(0x8000000000000003, 0x0000000000000000, 0x8000000000000003);
+  status |=
+      test__subdf3(0x8000000000000003, 0x7ff0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0x8000000000000003, 0x8000000000000000, 0x8000000000000003);
+  status |=
+      test__subdf3(0x8000000000000003, 0x8000000000000002, 0x8000000000000001);
+  status |=
+      test__subdf3(0x8000000000000003, 0xc008000000000000, 0x4008000000000000);
+  status |=
+      test__subdf3(0x8000000000000003, 0xffe0000000000000, 0x7fe0000000000000);
+  status |=
+      test__subdf3(0x8000000000000003, 0xfff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0x8000000000000004, 0x0000000000000004, 0x8000000000000008);
+  status |=
+      test__subdf3(0x800ffffffffffffd, 0x800ffffffffffffe, 0x0000000000000001);
+  status |=
+      test__subdf3(0x800fffffffffffff, 0x000fffffffffffff, 0x801ffffffffffffe);
+  status |=
+      test__subdf3(0x800fffffffffffff, 0x800ffffffffffffe, 0x8000000000000001);
+  status |=
+      test__subdf3(0x800fffffffffffff, 0x800fffffffffffff, 0x0000000000000000);
+  status |=
+      test__subdf3(0x800fffffffffffff, 0x8010000000000000, 0x0000000000000001);
+  status |=
+      test__subdf3(0x8010000000000000, 0x8000000000000000, 0x8010000000000000);
+  status |=
+      test__subdf3(0x8010000000000000, 0x8010000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0x8010000000000001, 0x8010000000000000, 0x8000000000000001);
+  status |=
+      test__subdf3(0x8010000000000001, 0x8010000000000002, 0x0000000000000001);
+  status |=
+      test__subdf3(0x801fffffffffffff, 0x8020000000000000, 0x0000000000000001);
+  status |=
+      test__subdf3(0x801fffffffffffff, 0x8020000000000002, 0x0000000000000005);
+  status |=
+      test__subdf3(0x801fffffffffffff, 0x8020000000000004, 0x0000000000000009);
+  status |=
+      test__subdf3(0x8020000000000000, 0x801fffffffffffff, 0x8000000000000001);
+  status |=
+      test__subdf3(0x8020000000000001, 0x8010000000000001, 0x8010000000000001);
+  status |=
+      test__subdf3(0x8020000000000001, 0x801fffffffffffff, 0x8000000000000003);
+  status |=
+      test__subdf3(0x8020000000000002, 0x8010000000000001, 0x8010000000000003);
+  status |=
+      test__subdf3(0x802fffffffffffff, 0x8030000000000000, 0x0000000000000002);
+  status |=
+      test__subdf3(0x8030000000000000, 0x802fffffffffffff, 0x8000000000000002);
+  status |=
+      test__subdf3(0x8030000000000001, 0x802fffffffffffff, 0x8000000000000006);
+  status |=
+      test__subdf3(0x8030000000000002, 0x8020000000000003, 0x8020000000000001);
+  status |=
+      test__subdf3(0xbff0000000000000, 0x0000000000000000, 0xbff0000000000000);
+  status |=
+      test__subdf3(0xbff0000000000000, 0x3ff0000000000003, 0xc000000000000002);
+  status |=
+      test__subdf3(0xbff0000000000001, 0x3ff0000000000000, 0xc000000000000000);
+  status |=
+      test__subdf3(0xbff0000000000001, 0xbff0000000000000, 0xbcb0000000000000);
+  status |=
+      test__subdf3(0xbff0000000000001, 0xbff0000000000002, 0x3cb0000000000000);
+  status |=
+      test__subdf3(0xbffffffffffffffc, 0xbffffffffffffffd, 0x3cb0000000000000);
+  status |=
+      test__subdf3(0xbfffffffffffffff, 0x8000000000000001, 0xbfffffffffffffff);
+  status |=
+      test__subdf3(0xbfffffffffffffff, 0xc000000000000000, 0x3cb0000000000000);
+  status |=
+      test__subdf3(0xc000000000000000, 0x4000000000000001, 0xc010000000000000);
+  status |=
+      test__subdf3(0xc000000000000000, 0xbfffffffffffffff, 0xbcb0000000000000);
+  status |=
+      test__subdf3(0xc000000000000000, 0xc000000000000001, 0x3cc0000000000000);
+  status |=
+      test__subdf3(0xc000000000000001, 0x4000000000000002, 0xc010000000000002);
+  status |=
+      test__subdf3(0xc000000000000001, 0xbff0000000000001, 0xbff0000000000001);
+  status |=
+      test__subdf3(0xc000000000000002, 0xbff0000000000001, 0xbff0000000000003);
+  status |=
+      test__subdf3(0xc000000000000002, 0xbff0000000000003, 0xbff0000000000001);
+  status |=
+      test__subdf3(0xc000000000000004, 0xc000000000000003, 0xbcc0000000000000);
+  status |=
+      test__subdf3(0xc008000000000000, 0xc008000000000000, 0x0000000000000000);
+  status |=
+      test__subdf3(0xc00fffffffffffff, 0x3cafffffffffffff, 0xc00fffffffffffff);
+  status |=
+      test__subdf3(0xc00fffffffffffff, 0x3cb0000000000000, 0xc010000000000000);
+  status |=
+      test__subdf3(0xc00fffffffffffff, 0xc00ffffffffffffe, 0xbcc0000000000000);
+  status |=
+      test__subdf3(0xc00fffffffffffff, 0xc010000000000002, 0x3ce4000000000000);
+  status |=
+      test__subdf3(0xc010000000000001, 0xc00fffffffffffff, 0xbcd8000000000000);
+  status |=
+      test__subdf3(0xffb0000000000001, 0xffafffffffffffff, 0xfc78000000000000);
+  status |=
+      test__subdf3(0xffcfffffffffffff, 0xffcffffffffffffe, 0xfc80000000000000);
+  status |=
+      test__subdf3(0xffcfffffffffffff, 0xffd0000000000002, 0x7ca4000000000000);
+  status |=
+      test__subdf3(0xffd0000000000000, 0xffcfffffffffffff, 0xfc80000000000000);
+  status |=
+      test__subdf3(0xffd0000000000000, 0xffd0000000000001, 0x7c90000000000000);
+  status |=
+      test__subdf3(0xffd0000000000001, 0x7fd0000000000000, 0xffe0000000000000);
+  status |=
+      test__subdf3(0xffd0000000000001, 0xffe0000000000001, 0x7fd0000000000001);
+  status |=
+      test__subdf3(0xffd0000000000002, 0xffc0000000000003, 0xffc0000000000001);
+  status |=
+      test__subdf3(0xffd0000000000004, 0xffd0000000000003, 0xfc90000000000000);
+  status |=
+      test__subdf3(0xffdffffffffffffe, 0x7fdffffffffffffe, 0xffeffffffffffffe);
+  status |=
+      test__subdf3(0xffdffffffffffffe, 0x7fdfffffffffffff, 0xffeffffffffffffe);
+  status |=
+      test__subdf3(0xffdffffffffffffe, 0xffdffffffffffffe, 0x0000000000000000);
+  status |=
+      test__subdf3(0xffdfffffffffffff, 0x3ff0000000000000, 0xffdfffffffffffff);
+  status |=
+      test__subdf3(0xffdfffffffffffff, 0x7fe0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xffdfffffffffffff, 0xbff0000000000000, 0xffdfffffffffffff);
+  status |=
+      test__subdf3(0xffdfffffffffffff, 0xffe0000000000000, 0x7c90000000000000);
+  status |=
+      test__subdf3(0xffe0000000000000, 0x0000000000000000, 0xffe0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000000, 0x3ff0000000000000, 0xffe0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000000, 0x7fe0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000000, 0x7ff0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000000, 0x8000000000000000, 0xffe0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000000, 0xbff0000000000000, 0xffe0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000000, 0xfff0000000000000, 0x7ff0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000001, 0x7fe0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000001, 0xffe0000000000000, 0xfca0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000001, 0xffe0000000000002, 0x7ca0000000000000);
+  status |=
+      test__subdf3(0xffe0000000000002, 0xffd0000000000001, 0xffd0000000000003);
+  status |=
+      test__subdf3(0xffeffffffffffffe, 0x3ff0000000000000, 0xffeffffffffffffe);
+  status |=
+      test__subdf3(0xffeffffffffffffe, 0x7feffffffffffffe, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xffeffffffffffffe, 0x7fefffffffffffff, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xffeffffffffffffe, 0xbff0000000000000, 0xffeffffffffffffe);
+  status |=
+      test__subdf3(0xffeffffffffffffe, 0xffefffffffffffff, 0x7ca0000000000000);
+  status |=
+      test__subdf3(0xffefffffffffffff, 0x3ff0000000000000, 0xffefffffffffffff);
+  status |=
+      test__subdf3(0xffefffffffffffff, 0x8000000000000001, 0xffefffffffffffff);
+  status |=
+      test__subdf3(0xffefffffffffffff, 0xbff0000000000000, 0xffefffffffffffff);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x0000000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x000fffffffffffff, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x7fe0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x7ff0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x8000000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x800fffffffffffff, 0xfff0000000000000);
+  status |=
+      test__subdf3(0xfff0000000000000, 0xffe0000000000000, 0xfff0000000000000);
+  status |=
+      test__subdf3(0x004caed458edc883, 0x004f7fc23eeef153, 0x8016876f30094680);
+  status |=
+      test__subdf3(0x0028000000000000, 0x0010000000000001, 0x001fffffffffffff);
+  status |=
+      test__subdf3(0x0028000000000000, 0x0010000000000000, 0x0020000000000000);
+  status |=
+      test__subdf3(0x001fffffffffffff, 0x0010000000000000, 0x000fffffffffffff);
+  status |=
+      test__subdf3(0x001fffffffffffff, 0x000fffffffffffff, 0x0010000000000000);
+  status |=
+      test__subdf3(0x0020000000000000, 0x0010000000000000, 0x0010000000000000);
+  status |=
+      test__subdf3(0x0038000000000000, 0x0034000000000001, 0x000ffffffffffffc);
+  status |=
+      test__subdf3(0x0038000000000000, 0x0034000000000000, 0x0010000000000000);
+  status |=
+      test__subdf3(0x0038000000000000, 0x0030000000000001, 0x001ffffffffffffc);
+  status |=
+      test__subdf3(0x0038000000000000, 0x0030000000000000, 0x0020000000000000);
+  status |=
+      test__subdf3(0x000fffffffe00000, 0x801000000007ffff, 0x001fffffffe7ffff);
+  status |=
+      test__subdf3(0x0010000000004000, 0x800effffffffffff, 0x001f000000003fff);
+  status |=
+      test__subdf3(0x800000000fffffff, 0x001ffff000000000, 0x801ffff00fffffff);
+  status |=
+      test__subdf3(0x800fffff80000000, 0x001000000fffffff, 0x801fffff8fffffff);
+  status |=
+      test__subdf3(0x80100000001fffff, 0x000ffffeffffffff, 0x801fffff001ffffe);
+
+  // Test that the result of an operation is a NaN at all when it should be.
+  //
+  // In most configurations these tests' results are checked using
+  // compareResultD, so we set all the answers to the canonical NaN
+  // 0x7ff8000000000000, which causes compareResultD to accept any NaN
+  // encoding. We also use the same value as the input NaN in tests that have
+  // one, so that even in EXPECT_EXACT_RESULTS mode these tests should pass,
+  // because 0x7ff8000000000000 is still the exact expected NaN.
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000);
+  status |=
+      test__subdf3(0xfff0000000000000, 0xfff0000000000000, 0x7ff8000000000000);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0x7ff8000000000000, 0x7ff8000000000000);
+  status |=
+      test__subdf3(0x7ff8000000000000, 0x3ff0000000000000, 0x7ff8000000000000);
+  status |=
+      test__subdf3(0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000);
+
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by
+  // the subtraction function in arm/adddf3.S:
+  //
+  //  - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  //  - if a signalling NaN appears in the input, the output quiet NaN is
+  //    obtained by setting its top mantissa bit and leaving everything else
+  //    unchanged
+  //
+  //  - if both operands are signalling NaNs then the output NaN is derived
+  //    from the first operand
+  //
+  //  - if both operands are quiet NaNs then the output NaN is the first
+  //    operand
+  //
+  //  - invalid operations not involving an input NaN return the quiet
+  //    NaN with fewest bits set, 0x7ff8000000000000.
+  status |=
+      test__subdf3(0x0000000000000000, 0x7ff3758244400801, 0x7ffb758244400801);
+  status |=
+      test__subdf3(0x0000000000000000, 0x7fff44d3f65148af, 0x7fff44d3f65148af);
+  status |=
+      test__subdf3(0x0000000000000001, 0x7ff48607b4b37057, 0x7ffc8607b4b37057);
+  status |=
+      test__subdf3(0x0000000000000001, 0x7ff855f2d435b33d, 0x7ff855f2d435b33d);
+  status |=
+      test__subdf3(0x000fffffffffffff, 0x7ff169269a674e13, 0x7ff969269a674e13);
+  status |=
+      test__subdf3(0x000fffffffffffff, 0x7ffc80978b2ef0da, 0x7ffc80978b2ef0da);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0x7ff3458ad034593d, 0x7ffb458ad034593d);
+  status |=
+      test__subdf3(0x3ff0000000000000, 0x7ffdd8bb98c9f13a, 0x7ffdd8bb98c9f13a);
+  status |=
+      test__subdf3(0x7fefffffffffffff, 0x7ff79a8b96250a98, 0x7fff9a8b96250a98);
+  status |=
+      test__subdf3(0x7fefffffffffffff, 0x7ffdcc675b63bb94, 0x7ffdcc675b63bb94);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x7ff018cfaf4d0fff, 0x7ff818cfaf4d0fff);
+  status |=
+      test__subdf3(0x7ff0000000000000, 0x7ff83ad1ab4dfd24, 0x7ff83ad1ab4dfd24);
+  status |=
+      test__subdf3(0x7ff48ce6c0cdd5ac, 0x0000000000000000, 0x7ffc8ce6c0cdd5ac);
+  status |=
+      test__subdf3(0x7ff08a34f3d5385b, 0x0000000000000001, 0x7ff88a34f3d5385b);
+  status |=
+      test__subdf3(0x7ff0a264c1c96281, 0x000fffffffffffff, 0x7ff8a264c1c96281);
+  status |=
+      test__subdf3(0x7ff77ce629e61f0e, 0x3ff0000000000000, 0x7fff7ce629e61f0e);
+  status |=
+      test__subdf3(0x7ff715e2d147fd76, 0x7fefffffffffffff, 0x7fff15e2d147fd76);
+  status |=
+      test__subdf3(0x7ff689a2031f1781, 0x7ff0000000000000, 0x7ffe89a2031f1781);
+  status |=
+      test__subdf3(0x7ff5dfb4a0c8cd05, 0x7ff11c1fe9793a33, 0x7ffddfb4a0c8cd05);
+  status |=
+      test__subdf3(0x7ff5826283ffb5d7, 0x7fff609b83884e81, 0x7ffd826283ffb5d7);
+  status |=
+      test__subdf3(0x7ff7cb03f2e61d42, 0x8000000000000000, 0x7fffcb03f2e61d42);
+  status |=
+      test__subdf3(0x7ff2adc8dfe72c96, 0x8000000000000001, 0x7ffaadc8dfe72c96);
+  status |=
+      test__subdf3(0x7ff4fc0bacc707f2, 0x800fffffffffffff, 0x7ffcfc0bacc707f2);
+  status |=
+      test__subdf3(0x7ff76248c8c9a619, 0xbff0000000000000, 0x7fff6248c8c9a619);
+  status |=
+      test__subdf3(0x7ff367972fce131b, 0xffefffffffffffff, 0x7ffb67972fce131b);
+  status |=
+      test__subdf3(0x7ff188f5ac284e92, 0xfff0000000000000, 0x7ff988f5ac284e92);
+  status |=
+      test__subdf3(0x7ffed4c22e4e569d, 0x0000000000000000, 0x7ffed4c22e4e569d);
+  status |=
+      test__subdf3(0x7ffe95105fa3f339, 0x0000000000000001, 0x7ffe95105fa3f339);
+  status |=
+      test__subdf3(0x7ffb8d33dbb9ecfb, 0x000fffffffffffff, 0x7ffb8d33dbb9ecfb);
+  status |=
+      test__subdf3(0x7ff874e41dc63e07, 0x3ff0000000000000, 0x7ff874e41dc63e07);
+  status |=
+      test__subdf3(0x7ffe27594515ecdf, 0x7fefffffffffffff, 0x7ffe27594515ecdf);
+  status |=
+      test__subdf3(0x7ffeac86d5c69bdf, 0x7ff0000000000000, 0x7ffeac86d5c69bdf);
+  status |=
+      test__subdf3(0x7ff97d657b99f76f, 0x7ff7e4149862a796, 0x7fffe4149862a796);
+  status |=
+      test__subdf3(0x7ffad17c6aa33fad, 0x7ffd898893ad4d28, 0x7ffad17c6aa33fad);
+  status |=
+      test__subdf3(0x7ff96e04e9c3d173, 0x8000000000000000, 0x7ff96e04e9c3d173);
+  status |=
+      test__subdf3(0x7ffec01ad8da3abb, 0x8000000000000001, 0x7ffec01ad8da3abb);
+  status |=
+      test__subdf3(0x7ffd1d565c495941, 0x800fffffffffffff, 0x7ffd1d565c495941);
+  status |=
+      test__subdf3(0x7ffe3d24f1e474a7, 0xbff0000000000000, 0x7ffe3d24f1e474a7);
+  status |=
+      test__subdf3(0x7ffc206f2bb8c8ce, 0xffefffffffffffff, 0x7ffc206f2bb8c8ce);
+  status |=
+      test__subdf3(0x7ff93efdecfb7d3b, 0xfff0000000000000, 0x7ff93efdecfb7d3b);
+  status |=
+      test__subdf3(0x8000000000000000, 0x7ff2ee725d143ac5, 0x7ffaee725d143ac5);
+  status |=
+      test__subdf3(0x8000000000000000, 0x7ffbba26e5c5fe98, 0x7ffbba26e5c5fe98);
+  status |=
+      test__subdf3(0x8000000000000001, 0x7ff7818a1cd26df9, 0x7fff818a1cd26df9);
+  status |=
+      test__subdf3(0x8000000000000001, 0x7ffaee6cc63b5292, 0x7ffaee6cc63b5292);
+  status |=
+      test__subdf3(0x800fffffffffffff, 0x7ff401096edaf79d, 0x7ffc01096edaf79d);
+  status |=
+      test__subdf3(0x800fffffffffffff, 0x7ffbf1778c7a2e59, 0x7ffbf1778c7a2e59);
+  status |=
+      test__subdf3(0xbff0000000000000, 0x7ff2e8fb0201c496, 0x7ffae8fb0201c496);
+  status |=
+      test__subdf3(0xbff0000000000000, 0x7ffcb6a5adb2e154, 0x7ffcb6a5adb2e154);
+  status |=
+      test__subdf3(0xffefffffffffffff, 0x7ff1ea1bfc15d71d, 0x7ff9ea1bfc15d71d);
+  status |=
+      test__subdf3(0xffefffffffffffff, 0x7ffae0766e21efc0, 0x7ffae0766e21efc0);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x7ff3b364cffbdfe6, 0x7ffbb364cffbdfe6);
+  status |=
+      test__subdf3(0xfff0000000000000, 0x7ffd0d3223334ae3, 0x7ffd0d3223334ae3);
+
+#endif // ARM_NAN_HANDLING
+
+  return status;
+}