diff --git a/libc/shared/math.h b/libc/shared/math.h index b6f7a2b6b20b..e43ed9863084 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -100,6 +100,7 @@ #include "math/logbf16.h" #include "math/logf.h" #include "math/logf16.h" +#include "math/pow.h" #include "math/rsqrtf.h" #include "math/rsqrtf16.h" #include "math/sin.h" diff --git a/libc/shared/math/pow.h b/libc/shared/math/pow.h new file mode 100644 index 000000000000..9743683a8623 --- /dev/null +++ b/libc/shared/math/pow.h @@ -0,0 +1,23 @@ +//===-- Shared pow function -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_POW_H +#define LLVM_LIBC_SHARED_MATH_POW_H + +#include "shared/libc_common.h" +#include "src/__support/math/pow.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::pow; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_POW_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 5453cd65c967..3f2f13c3f185 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -1492,6 +1492,26 @@ add_header_library( libc.src.__support.uint128 ) +add_header_library( + pow + HDRS + pow.h + DEPENDS + .common_constants + .exp_constants + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization +) + add_header_library( sin HDRS diff --git a/libc/src/__support/math/pow.h b/libc/src/__support/math/pow.h new file mode 100644 index 000000000000..eebbd72bd31c --- /dev/null +++ b/libc/src/__support/math/pow.h @@ -0,0 +1,545 @@ +//===-- Implementation header for pow ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_POW_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_POW_H + +#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. +#include "exp_constants.h" // Lookup tables EXP_M1 and EXP_M2. +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/sqrt.h" // Speedup for pow(x, 1/2) = sqrt(x) +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +namespace pow_internal { + +using fputil::DoubleDouble; + +using namespace common_constants_internal; + +// Constants for log2(x) range reduction, generated by Sollya with: +// > for i from 0 to 127 do { +// r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) ); +// b = nearestint(log2(r) * 2^41) * 2^-41; +// c = round(log2(r) - b, D, RN); +// print("{", -c, ",", -b, "},"); +// }; +// This is the same as -log2(RD[i]), with the least significant bits of the +// high part set to be 2^-41, so that the sum of high parts + e_x is exact in +// double precision. +// We also replace the first and the last ones to be 0. +LIBC_INLINE_VAR constexpr DoubleDouble LOG2_R_DD[128] = { + {0.0, 0.0}, + {-0x1.19b14945cf6bap-44, 0x1.72c7ba21p-7}, + {-0x1.95539356f93dcp-43, 0x1.743ee862p-6}, + {0x1.abe0a48f83604p-43, 0x1.184b8e4c5p-5}, + {0x1.635577970e04p-43, 0x1.77394c9d9p-5}, + {-0x1.401fbaaa67e3cp-45, 0x1.d6ebd1f2p-5}, + {-0x1.5b1799ceaeb51p-43, 0x1.1bb32a6008p-4}, + {0x1.7c407050799bfp-43, 0x1.4c560fe688p-4}, + {0x1.da6339da288fcp-43, 0x1.7d60496cf8p-4}, + {0x1.be4f6f22dbbadp-43, 0x1.960caf9ab8p-4}, + {-0x1.c760bc9b188c4p-45, 0x1.c7b528b71p-4}, + {0x1.164e932b2d51cp-44, 0x1.f9c95dc1dp-4}, + {0x1.924ae921f7ecap-45, 0x1.097e38ce6p-3}, + {-0x1.6d25a5b8a19b2p-44, 0x1.22dadc2ab4p-3}, + {0x1.e50a1644ac794p-43, 0x1.3c6fb650ccp-3}, + {0x1.f34baa74a7942p-43, 0x1.494f863b8cp-3}, + {-0x1.8f7aac147fdc1p-46, 0x1.633a8bf438p-3}, + {0x1.f84be19cb9578p-43, 0x1.7046031c78p-3}, + {-0x1.66cccab240e9p-46, 0x1.8a8980abfcp-3}, + {-0x1.3f7a55cd2af4cp-47, 0x1.97c1cb13c8p-3}, + {0x1.3458cde69308cp-43, 0x1.b2602497d4p-3}, + {-0x1.667f21fa8423fp-44, 0x1.bfc67a8p-3}, + {0x1.d2fe4574e09b9p-47, 0x1.dac22d3e44p-3}, + {0x1.367bde40c5e6dp-43, 0x1.e857d3d36p-3}, + {0x1.d45da26510033p-46, 0x1.01d9bbcfa6p-2}, + {-0x1.7204f55bbf90dp-44, 0x1.08bce0d96p-2}, + {-0x1.d4f1b95e0ff45p-43, 0x1.169c05364p-2}, + {0x1.c20d74c0211bfp-44, 0x1.1d982c9d52p-2}, + {0x1.ad89a083e072ap-43, 0x1.249cd2b13cp-2}, + {0x1.cd0cb4492f1bcp-43, 0x1.32bfee370ep-2}, + {-0x1.2101a9685c779p-47, 0x1.39de8e155ap-2}, + {0x1.9451cd394fe8dp-43, 0x1.4106017c3ep-2}, + {0x1.661e393a16b95p-44, 0x1.4f6fbb2cecp-2}, + {-0x1.c6d8d86531d56p-44, 0x1.56b22e6b58p-2}, + {0x1.c1c885adb21d3p-43, 0x1.5dfdcf1eeap-2}, + {0x1.3bb5921006679p-45, 0x1.6552b49986p-2}, + {0x1.1d406db502403p-43, 0x1.6cb0f6865cp-2}, + {0x1.55a63e278bad5p-43, 0x1.7b89f02cf2p-2}, + {-0x1.66ae2a7ada553p-49, 0x1.8304d90c12p-2}, + {-0x1.66cccab240e9p-45, 0x1.8a8980abfcp-2}, + {-0x1.62404772a151dp-45, 0x1.921800924ep-2}, + {0x1.ac9bca36fd02ep-44, 0x1.99b072a96cp-2}, + {0x1.4bc302ffa76fbp-43, 0x1.a8ff97181p-2}, + {0x1.01fea1ec47c71p-43, 0x1.b0b67f4f46p-2}, + {-0x1.f20203b3186a6p-43, 0x1.b877c57b1cp-2}, + {-0x1.2642415d47384p-45, 0x1.c043859e3p-2}, + {-0x1.bc76a2753b99bp-50, 0x1.c819dc2d46p-2}, + {-0x1.da93ae3a5f451p-43, 0x1.cffae611aep-2}, + {-0x1.50e785694a8c6p-43, 0x1.d7e6c0abc4p-2}, + {0x1.c56138c894641p-43, 0x1.dfdd89d586p-2}, + {0x1.5669df6a2b592p-43, 0x1.e7df5fe538p-2}, + {-0x1.ea92d9e0e8ac2p-48, 0x1.efec61b012p-2}, + {0x1.a0331af2e6feap-43, 0x1.f804ae8d0cp-2}, + {0x1.9518ce032f41dp-48, 0x1.0014332bep-1}, + {-0x1.b3b3864c60011p-44, 0x1.042bd4b9a8p-1}, + {-0x1.103e8f00d41c8p-45, 0x1.08494c66b9p-1}, + {0x1.65be75cc3da17p-43, 0x1.0c6caaf0c5p-1}, + {0x1.3676289cd3dd4p-43, 0x1.1096015deep-1}, + {-0x1.41dfc7d7c3321p-43, 0x1.14c560fe69p-1}, + {0x1.e0cda8bd74461p-44, 0x1.18fadb6e2dp-1}, + {0x1.2a606046ad444p-44, 0x1.1d368296b5p-1}, + {0x1.f9ea977a639cp-43, 0x1.217868b0c3p-1}, + {-0x1.50520a377c7ecp-45, 0x1.25c0a0463cp-1}, + {0x1.6e3cb71b554e7p-47, 0x1.2a0f3c3407p-1}, + {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1}, + {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1}, + {-0x1.979a5db68721dp-45, 0x1.32bfee370fp-1}, + {0x1.1ee969a95f529p-43, 0x1.37222bb707p-1}, + {0x1.bb4b69336b66ep-43, 0x1.3b8b1c68fap-1}, + {0x1.d5e6a8a4fb059p-45, 0x1.3ffad4e74fp-1}, + {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1}, + {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1}, + {-0x1.9bcaf1aa4168ap-43, 0x1.48eef19318p-1}, + {0x1.1646b761c48dep-44, 0x1.4d7380dcc4p-1}, + {0x1.2f0c0bfe9dbecp-43, 0x1.51ff2e3021p-1}, + {0x1.29904613e33cp-43, 0x1.5692101d9bp-1}, + {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1}, + {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1}, + {-0x1.125d6cbcd1095p-44, 0x1.5fcdce2728p-1}, + {-0x1.bd9b32266d92cp-43, 0x1.6476d98adap-1}, + {0x1.54243b21709cep-44, 0x1.6927781d93p-1}, + {0x1.54243b21709cep-44, 0x1.6927781d93p-1}, + {-0x1.ce60916e52e91p-44, 0x1.6ddfc2a79p-1}, + {0x1.f1f5ae718f241p-43, 0x1.729fd26b7p-1}, + {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1}, + {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1}, + {0x1.fed21f9cb2cc5p-43, 0x1.7c37a9227ep-1}, + {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1}, + {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1}, + {0x1.5b338360c2ae2p-43, 0x1.85efd062c6p-1}, + {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1}, + {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1}, + {-0x1.bdc81c4db3134p-44, 0x1.8fc924c89bp-1}, + {0x1.36c101ee1344p-43, 0x1.94c287492cp-1}, + {0x1.36c101ee1344p-43, 0x1.94c287492cp-1}, + {0x1.e41fa0a62e6aep-44, 0x1.99c48be206p-1}, + {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1}, + {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1}, + {-0x1.3f94e00e7d6bcp-46, 0x1.a3e2f4ac44p-1}, + {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1}, + {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1}, + {0x1.1659d8e2d7d38p-44, 0x1.ae255819fp-1}, + {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1}, + {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1}, + {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1}, + {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1}, + {0x1.871a7610e40bdp-45, 0x1.bdce9dcc96p-1}, + {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1}, + {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1}, + {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1}, + {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1}, + {-0x1.9ad57391924a7p-43, 0x1.cdcebd2374p-1}, + {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1}, + {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1}, + {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1}, + {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1}, + {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1}, + {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1}, + {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1}, + {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1}, + {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1}, + {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1}, + {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1}, + {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1}, + {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1}, + {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1}, + {0x1.ef5d00e390ap-44, 0x1.fa406bd244p-1}, + {0.0, 1.0}, +}; + +LIBC_INLINE constexpr bool is_odd_integer(double x) { + using FPBits = fputil::FPBits; + FPBits xbits(x); + uint64_t x_u = xbits.uintval(); + unsigned x_e = static_cast(xbits.get_biased_exponent()); + unsigned lsb = + static_cast(cpp::countr_zero(x_u | FPBits::EXP_MASK)); + constexpr unsigned UNIT_EXPONENT = + static_cast(FPBits::EXP_BIAS + FPBits::FRACTION_LEN); + return (x_e + lsb == UNIT_EXPONENT); +} + +LIBC_INLINE constexpr bool is_integer(double x) { + using FPBits = fputil::FPBits; + FPBits xbits(x); + uint64_t x_u = xbits.uintval(); + unsigned x_e = static_cast(xbits.get_biased_exponent()); + unsigned lsb = + static_cast(cpp::countr_zero(x_u | FPBits::EXP_MASK)); + constexpr unsigned UNIT_EXPONENT = + static_cast(FPBits::EXP_BIAS + FPBits::FRACTION_LEN); + return (x_e + lsb >= UNIT_EXPONENT); +} + +} // namespace pow_internal + +LIBC_INLINE double pow(double x, double y) { + using namespace pow_internal; + using FPBits = fputil::FPBits; + + FPBits xbits(x), ybits(y); + + bool x_sign = xbits.sign() == Sign::NEG; + bool y_sign = ybits.sign() == Sign::NEG; + + FPBits x_abs = xbits.abs(); + FPBits y_abs = ybits.abs(); + + uint64_t x_mant = xbits.get_mantissa(); + uint64_t y_mant = ybits.get_mantissa(); + uint64_t x_u = xbits.uintval(); + uint64_t x_a = x_abs.uintval(); + uint64_t y_a = y_abs.uintval(); + + double e_x = static_cast(xbits.get_exponent()); + uint64_t sign = 0; + + ///////// BEGIN - Check exceptional cases //////////////////////////////////// + // If x or y is signaling NaN + if (x_abs.is_signaling_nan() || y_abs.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + // The double precision number that is closest to 1 is (1 - 2^-53), which has + // log2(1 - 2^-53) ~ -1.715...p-53. + // So if |y| > |1075 / log2(1 - 2^-53)|, and x is finite: + // |y * log2(x)| = 0 or > 1075. + // Hence x^y will either overflow or underflow if x is not zero. + if (LIBC_UNLIKELY(y_mant == 0 || y_a > 0x43d7'4910'd52d'3052 || + x_u == FPBits::one().uintval() || + x_u >= FPBits::inf().uintval() || + x_u < FPBits::min_normal().uintval())) { + // Exceptional exponents. + if (y == 0.0) + return 1.0; + + switch (y_a) { + case 0x3fe0'0000'0000'0000: { // y = +-0.5 + // TODO: speed up x^(-1/2) with rsqrt(x) when available. + if (LIBC_UNLIKELY( + (x == 0.0 || x_u == FPBits::inf(Sign::NEG).uintval()))) { + // pow(-0, 1/2) = +0 + // pow(-inf, 1/2) = +inf + // Make sure it works correctly for FTZ/DAZ. + return y_sign ? 1.0 / (x * x) : (x * x); + } + return y_sign ? (1.0 / fputil::sqrt(x)) : fputil::sqrt(x); + } + case 0x3ff0'0000'0000'0000: // y = +-1.0 + return y_sign ? (1.0 / x) : x; + case 0x4000'0000'0000'0000: // y = +-2.0; + return y_sign ? (1.0 / (x * x)) : (x * x); + } + + // |y| > |1075 / log2(1 - 2^-53)|. + if (y_a > 0x43d7'4910'd52d'3052) { + if (y_a >= 0x7ff0'0000'0000'0000) { + // y is inf or nan + if (y_mant != 0) { + // y is NaN + // pow(1, NaN) = 1 + // pow(x, NaN) = NaN + return (x_u == FPBits::one().uintval()) ? 1.0 : y; + } + + // Now y is +-Inf + if (x_abs.is_nan()) { + // pow(NaN, +-Inf) = NaN + return x; + } + + if (x_a == 0x3ff0'0000'0000'0000) { + // pow(+-1, +-Inf) = 1.0 + return 1.0; + } + + if (x == 0.0 && y_sign) { + // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_DIVBYZERO); + return FPBits::inf().get_val(); + } + // pow (|x| < 1, -inf) = +inf + // pow (|x| < 1, +inf) = 0.0 + // pow (|x| > 1, -inf) = 0.0 + // pow (|x| > 1, +inf) = +inf + return ((x_a < FPBits::one().uintval()) == y_sign) + ? FPBits::inf().get_val() + : 0.0; + } + // x^y will overflow / underflow in double precision. Set y to a + // large enough exponent but not too large, so that the computations + // won't overflow in double precision. + y = y_sign ? -0x1.0p100 : 0x1.0p100; + } + + // y is finite and non-zero. + + if (x_u == FPBits::one().uintval()) { + // pow(1, y) = 1 + return 1.0; + } + + // TODO: Speed things up with pow(2, y) = exp2(y) and pow(10, y) = exp10(y). + + if (x == 0.0) { + bool out_is_neg = x_sign && is_odd_integer(y); + if (y_sign) { + // pow(0, negative number) = inf + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_DIVBYZERO); + return FPBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val(); + } + // pow(0, positive number) = 0 + return out_is_neg ? -0.0 : 0.0; + } + + if (x_a == FPBits::inf().uintval()) { + bool out_is_neg = x_sign && is_odd_integer(y); + if (y_sign) + return out_is_neg ? -0.0 : 0.0; + return FPBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val(); + } + + if (x_a > FPBits::inf().uintval()) { + // x is NaN. + // pow (aNaN, 0) is already taken care above. + return x; + } + + // Normalize denormal inputs. + if (x_a < FPBits::min_normal().uintval()) { + e_x -= 64.0; + x_mant = FPBits(x * 0x1.0p64).get_mantissa(); + } + + // x is finite and negative, and y is a finite integer. + if (x_sign) { + if (is_integer(y)) { + x = -x; + if (is_odd_integer(y)) + // sign = -1.0; + sign = 0x8000'0000'0000'0000; + } else { + // pow( negative, non-integer ) = NaN + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + } + } + + ///////// END - Check exceptional cases ////////////////////////////////////// + + // x^y = 2^( y * log2(x) ) + // = 2^( y * ( e_x + log2(m_x) ) ) + // First we compute log2(x) = e_x + log2(m_x) + + // Extract exponent field of x. + + // Use the highest 7 fractional bits of m_x as the index for look up tables. + unsigned idx_x = static_cast(x_mant >> (FPBits::FRACTION_LEN - 7)); + // Add the hidden bit to the mantissa. + // 1 <= m_x < 2 + FPBits m_x = FPBits(x_mant | 0x3ff0'0000'0000'0000); + + // Reduced argument for log2(m_x): + // dx = r * m_x - 1. + // The computation is exact, and -2^-8 <= dx < 2^-7. + // Then m_x = (1 + dx) / r, and + // log2(m_x) = log2( (1 + dx) / r ) + // = log2(1 + dx) - log2(r). + + // In order for the overall computations x^y = 2^(y * log2(x)) to have the + // relative errors < 2^-52 (1ULP), we will need to evaluate the exponent part + // y * log2(x) with absolute errors < 2^-52 (or better, 2^-53). Since the + // whole exponent range for double precision is bounded by + // |y * log2(x)| < 1076 ~ 2^10, we need to evaluate log2(x) with absolute + // errors < 2^-53 * 2^-10 = 2^-63. + + // With that requirement, we use the following degree-6 polynomial + // approximation: + // P(dx) ~ log2(1 + dx) / dx + // Generated by Sollya with: + // > P = fpminimax(log2(1 + x)/x, 6, [|D...|], [-2^-8, 2^-7]); P; + // > dirtyinfnorm(log2(1 + x) - x*P, [-2^-8, 2^-7]); + // 0x1.d03cc...p-66 + constexpr double COEFFS[] = {0x1.71547652b82fep0, -0x1.71547652b82e7p-1, + 0x1.ec709dc3b1fd5p-2, -0x1.7154766124215p-2, + 0x1.2776bd90259d8p-2, -0x1.ec586c6f3d311p-3, + 0x1.9c4775eccf524p-3}; + // Error: ulp(dx^2) <= (2^-7)^2 * 2^-52 = 2^-66 + // Extra errors from various computations and rounding directions, the overall + // errors we can be bounded by 2^-65. + + DoubleDouble dx_c0; + + // Perform exact range reduction and exact product dx * c0. +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE + double dx = fputil::multiply_add(RD[idx_x], m_x.get_val(), -1.0); // Exact + dx_c0 = fputil::exact_mult(COEFFS[0], dx); +#else + double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val(); + double dx = + fputil::multiply_add(RD[idx_x], m_x.get_val() - c, CD[idx_x]); // Exact + dx_c0 = fputil::exact_mult(dx, COEFFS[0]); // Exact +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + + double dx2 = dx * dx; + double c0 = fputil::multiply_add(dx, COEFFS[2], COEFFS[1]); + double c1 = fputil::multiply_add(dx, COEFFS[4], COEFFS[3]); + double c2 = fputil::multiply_add(dx, COEFFS[6], COEFFS[5]); + + double p = fputil::polyeval(dx2, c0, c1, c2); + + // s = e_x - log2(r) + dx * P(dx) + // Absolute error bound: + // |log2(x) - log2_x.hi - log2_x.lo| < 2^-65. + + // Notice that e_x - log2(r).hi is exact, so we perform an exact sum of + // e_x - log2(r).hi and the high part of the product dx * c0: + // log2_x_hi.hi + log2_x_hi.lo = e_x - log2(r).hi + (dx * c0).hi + DoubleDouble log2_x_hi = + fputil::exact_add(e_x + LOG2_R_DD[idx_x].hi, dx_c0.hi); + // The low part is dx^2 * p + low part of (dx * c0) + low part of -log2(r). + double log2_x_lo = + fputil::multiply_add(dx2, p, dx_c0.lo + LOG2_R_DD[idx_x].lo); + // Perform accurate sums. + DoubleDouble log2_x = fputil::exact_add(log2_x_hi.hi, log2_x_lo); + log2_x.lo += log2_x_hi.lo; + + // To compute 2^(y * log2(x)), we break the exponent into 3 parts: + // y * log(2) = hi + mid + lo, where + // hi is an integer + // mid * 2^6 is an integer + // |lo| <= 2^-7 + // Then: + // x^y = 2^(y * log2(x)) = 2^hi * 2^mid * 2^lo, + // In which 2^mid is obtained from a look-up table of size 2^6 = 64 elements, + // and 2^lo ~ 1 + lo * P(lo). + // Thus, we have: + // hi + mid = 2^-6 * round( 2^6 * y * log2(x) ) + // If we restrict the output such that |hi| < 150, (hi + mid) uses (8 + 6) + // bits, hence, if we use double precision to perform + // round( 2^6 * y * log2(x)) + // the lo part is bounded by 2^-7 + 2^(-(52 - 14)) = 2^-7 + 2^-38 + + // In the following computations: + // y6 = 2^6 * y + // hm = 2^6 * (hi + mid) = round(2^6 * y * log2(x)) ~ round(y6 * s) + // lo6 = 2^6 * lo = 2^6 * (y - (hi + mid)) = y6 * log2(x) - hm. + double y6 = y * 0x1.0p6; // Exact. + + DoubleDouble y6_log2_x = fputil::exact_mult(y6, log2_x.hi); + y6_log2_x.lo = fputil::multiply_add(y6, log2_x.lo, y6_log2_x.lo); + + // Check overflow/underflow. + double scale = 1.0; + + // |2^(hi + mid) - exp2_hi_mid| <= ulp(exp2_hi_mid) / 2 + // Clamp the exponent part into smaller range that fits double precision. + // For those exponents that are out of range, the final conversion will round + // them correctly to inf/max float or 0/min float accordingly. + constexpr double UPPER_EXP_BOUND = 512.0 * 0x1.0p6; + if (LIBC_UNLIKELY(FPBits(y6_log2_x.hi).abs().get_val() >= UPPER_EXP_BOUND)) { + if (FPBits(y6_log2_x.hi).sign() == Sign::POS) { + scale = 0x1.0p512; + y6_log2_x.hi -= 512.0 * 64.0; + if (y6_log2_x.hi > 513.0 * 64.0) + y6_log2_x.hi = 513.0 * 64.0; + } else { + scale = 0x1.0p-512; + y6_log2_x.hi += 512.0 * 64.0; + if (y6_log2_x.hi < (-1076.0 + 512.0) * 64.0) + y6_log2_x.hi = -564.0 * 64.0; + } + } + + double hm = fputil::nearest_integer(y6_log2_x.hi); + + // lo6 = 2^6 * lo. + double lo6_hi = y6_log2_x.hi - hm; + double lo6 = lo6_hi + y6_log2_x.lo; + + int hm_i = static_cast(hm); + unsigned idx_y = static_cast(hm_i) & 0x3f; + + // 2^hi + int64_t exp2_hi_i = static_cast( + static_cast(static_cast(hm_i >> 6)) + << FPBits::FRACTION_LEN); + // 2^mid + int64_t exp2_mid_hi_i = + static_cast(FPBits(EXP2_MID1[idx_y].hi).uintval()); + int64_t exp2_mid_lo_i = + static_cast(FPBits(EXP2_MID1[idx_y].mid).uintval()); + // (-1)^sign * 2^hi * 2^mid + // Error <= 2^hi * 2^-53 + uint64_t exp2_hm_hi_i = + static_cast(exp2_hi_i + exp2_mid_hi_i) + sign; + // The low part could be 0. + uint64_t exp2_hm_lo_i = + idx_y != 0 ? static_cast(exp2_hi_i + exp2_mid_lo_i) + sign + : sign; + double exp2_hm_hi = FPBits(exp2_hm_hi_i).get_val(); + double exp2_hm_lo = FPBits(exp2_hm_lo_i).get_val(); + + // Degree-5 polynomial approximation P(lo6) ~ 2^(lo6 / 2^6) = 2^(lo). + // Generated by Sollya with: + // > P = fpminimax(2^(x/64), 5, [|1, D...|], [-2^-1, 2^-1]); + // > dirtyinfnorm(2^(x/64) - P, [-0.5, 0.5]); + // 0x1.a2b77e618f5c4c176fd11b7659016cde5de83cb72p-60 + constexpr double EXP2_COEFFS[] = {0x1p0, + 0x1.62e42fefa39efp-7, + 0x1.ebfbdff82a23ap-15, + 0x1.c6b08d7076268p-23, + 0x1.3b2ad33f8b48bp-31, + 0x1.5d870c4d84445p-40}; + + double lo6_sqr = lo6 * lo6; + + double d0 = fputil::multiply_add(lo6, EXP2_COEFFS[2], EXP2_COEFFS[1]); + double d1 = fputil::multiply_add(lo6, EXP2_COEFFS[4], EXP2_COEFFS[3]); + double pp = fputil::polyeval(lo6_sqr, d0, d1, EXP2_COEFFS[5]); + + double r = fputil::multiply_add(exp2_hm_hi * lo6, pp, exp2_hm_lo); + r += exp2_hm_hi; + + return r * scale; +} + +} // namespace math +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_POW_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 4036b1044d96..fde5c71c45d9 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1553,18 +1553,8 @@ add_entrypoint_object( HDRS ../pow.h DEPENDS - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.CPP.bit - libc.src.__support.FPUtil.double_double - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.sqrt - libc.src.__support.macros.optimization - libc.src.__support.math.common_constants + libc.src.__support.math.pow + libc.src.errno.errno ) add_entrypoint_object( diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index c9f685b82fcb..5fbd7f5441ad 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -7,531 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/math/pow.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/CPP/bit.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/sqrt.h" // Speedup for pow(x, 1/2) = sqrt(x) -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "src/__support/math/common_constants.h" // Lookup tables EXP_M1 and EXP_M2. -#include "src/__support/math/exp_constants.h" // Lookup tables EXP_M1 and EXP_M2. +#include "src/__support/math/pow.h" namespace LIBC_NAMESPACE_DECL { -using fputil::DoubleDouble; - -namespace { - -using namespace common_constants_internal; - -// Constants for log2(x) range reduction, generated by Sollya with: -// > for i from 0 to 127 do { -// r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) ); -// b = nearestint(log2(r) * 2^41) * 2^-41; -// c = round(log2(r) - b, D, RN); -// print("{", -c, ",", -b, "},"); -// }; -// This is the same as -log2(RD[i]), with the least significant bits of the -// high part set to be 2^-41, so that the sum of high parts + e_x is exact in -// double precision. -// We also replace the first and the last ones to be 0. -constexpr DoubleDouble LOG2_R_DD[128] = { - {0.0, 0.0}, - {-0x1.19b14945cf6bap-44, 0x1.72c7ba21p-7}, - {-0x1.95539356f93dcp-43, 0x1.743ee862p-6}, - {0x1.abe0a48f83604p-43, 0x1.184b8e4c5p-5}, - {0x1.635577970e04p-43, 0x1.77394c9d9p-5}, - {-0x1.401fbaaa67e3cp-45, 0x1.d6ebd1f2p-5}, - {-0x1.5b1799ceaeb51p-43, 0x1.1bb32a6008p-4}, - {0x1.7c407050799bfp-43, 0x1.4c560fe688p-4}, - {0x1.da6339da288fcp-43, 0x1.7d60496cf8p-4}, - {0x1.be4f6f22dbbadp-43, 0x1.960caf9ab8p-4}, - {-0x1.c760bc9b188c4p-45, 0x1.c7b528b71p-4}, - {0x1.164e932b2d51cp-44, 0x1.f9c95dc1dp-4}, - {0x1.924ae921f7ecap-45, 0x1.097e38ce6p-3}, - {-0x1.6d25a5b8a19b2p-44, 0x1.22dadc2ab4p-3}, - {0x1.e50a1644ac794p-43, 0x1.3c6fb650ccp-3}, - {0x1.f34baa74a7942p-43, 0x1.494f863b8cp-3}, - {-0x1.8f7aac147fdc1p-46, 0x1.633a8bf438p-3}, - {0x1.f84be19cb9578p-43, 0x1.7046031c78p-3}, - {-0x1.66cccab240e9p-46, 0x1.8a8980abfcp-3}, - {-0x1.3f7a55cd2af4cp-47, 0x1.97c1cb13c8p-3}, - {0x1.3458cde69308cp-43, 0x1.b2602497d4p-3}, - {-0x1.667f21fa8423fp-44, 0x1.bfc67a8p-3}, - {0x1.d2fe4574e09b9p-47, 0x1.dac22d3e44p-3}, - {0x1.367bde40c5e6dp-43, 0x1.e857d3d36p-3}, - {0x1.d45da26510033p-46, 0x1.01d9bbcfa6p-2}, - {-0x1.7204f55bbf90dp-44, 0x1.08bce0d96p-2}, - {-0x1.d4f1b95e0ff45p-43, 0x1.169c05364p-2}, - {0x1.c20d74c0211bfp-44, 0x1.1d982c9d52p-2}, - {0x1.ad89a083e072ap-43, 0x1.249cd2b13cp-2}, - {0x1.cd0cb4492f1bcp-43, 0x1.32bfee370ep-2}, - {-0x1.2101a9685c779p-47, 0x1.39de8e155ap-2}, - {0x1.9451cd394fe8dp-43, 0x1.4106017c3ep-2}, - {0x1.661e393a16b95p-44, 0x1.4f6fbb2cecp-2}, - {-0x1.c6d8d86531d56p-44, 0x1.56b22e6b58p-2}, - {0x1.c1c885adb21d3p-43, 0x1.5dfdcf1eeap-2}, - {0x1.3bb5921006679p-45, 0x1.6552b49986p-2}, - {0x1.1d406db502403p-43, 0x1.6cb0f6865cp-2}, - {0x1.55a63e278bad5p-43, 0x1.7b89f02cf2p-2}, - {-0x1.66ae2a7ada553p-49, 0x1.8304d90c12p-2}, - {-0x1.66cccab240e9p-45, 0x1.8a8980abfcp-2}, - {-0x1.62404772a151dp-45, 0x1.921800924ep-2}, - {0x1.ac9bca36fd02ep-44, 0x1.99b072a96cp-2}, - {0x1.4bc302ffa76fbp-43, 0x1.a8ff97181p-2}, - {0x1.01fea1ec47c71p-43, 0x1.b0b67f4f46p-2}, - {-0x1.f20203b3186a6p-43, 0x1.b877c57b1cp-2}, - {-0x1.2642415d47384p-45, 0x1.c043859e3p-2}, - {-0x1.bc76a2753b99bp-50, 0x1.c819dc2d46p-2}, - {-0x1.da93ae3a5f451p-43, 0x1.cffae611aep-2}, - {-0x1.50e785694a8c6p-43, 0x1.d7e6c0abc4p-2}, - {0x1.c56138c894641p-43, 0x1.dfdd89d586p-2}, - {0x1.5669df6a2b592p-43, 0x1.e7df5fe538p-2}, - {-0x1.ea92d9e0e8ac2p-48, 0x1.efec61b012p-2}, - {0x1.a0331af2e6feap-43, 0x1.f804ae8d0cp-2}, - {0x1.9518ce032f41dp-48, 0x1.0014332bep-1}, - {-0x1.b3b3864c60011p-44, 0x1.042bd4b9a8p-1}, - {-0x1.103e8f00d41c8p-45, 0x1.08494c66b9p-1}, - {0x1.65be75cc3da17p-43, 0x1.0c6caaf0c5p-1}, - {0x1.3676289cd3dd4p-43, 0x1.1096015deep-1}, - {-0x1.41dfc7d7c3321p-43, 0x1.14c560fe69p-1}, - {0x1.e0cda8bd74461p-44, 0x1.18fadb6e2dp-1}, - {0x1.2a606046ad444p-44, 0x1.1d368296b5p-1}, - {0x1.f9ea977a639cp-43, 0x1.217868b0c3p-1}, - {-0x1.50520a377c7ecp-45, 0x1.25c0a0463cp-1}, - {0x1.6e3cb71b554e7p-47, 0x1.2a0f3c3407p-1}, - {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1}, - {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1}, - {-0x1.979a5db68721dp-45, 0x1.32bfee370fp-1}, - {0x1.1ee969a95f529p-43, 0x1.37222bb707p-1}, - {0x1.bb4b69336b66ep-43, 0x1.3b8b1c68fap-1}, - {0x1.d5e6a8a4fb059p-45, 0x1.3ffad4e74fp-1}, - {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1}, - {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1}, - {-0x1.9bcaf1aa4168ap-43, 0x1.48eef19318p-1}, - {0x1.1646b761c48dep-44, 0x1.4d7380dcc4p-1}, - {0x1.2f0c0bfe9dbecp-43, 0x1.51ff2e3021p-1}, - {0x1.29904613e33cp-43, 0x1.5692101d9bp-1}, - {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1}, - {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1}, - {-0x1.125d6cbcd1095p-44, 0x1.5fcdce2728p-1}, - {-0x1.bd9b32266d92cp-43, 0x1.6476d98adap-1}, - {0x1.54243b21709cep-44, 0x1.6927781d93p-1}, - {0x1.54243b21709cep-44, 0x1.6927781d93p-1}, - {-0x1.ce60916e52e91p-44, 0x1.6ddfc2a79p-1}, - {0x1.f1f5ae718f241p-43, 0x1.729fd26b7p-1}, - {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1}, - {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1}, - {0x1.fed21f9cb2cc5p-43, 0x1.7c37a9227ep-1}, - {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1}, - {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1}, - {0x1.5b338360c2ae2p-43, 0x1.85efd062c6p-1}, - {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1}, - {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1}, - {-0x1.bdc81c4db3134p-44, 0x1.8fc924c89bp-1}, - {0x1.36c101ee1344p-43, 0x1.94c287492cp-1}, - {0x1.36c101ee1344p-43, 0x1.94c287492cp-1}, - {0x1.e41fa0a62e6aep-44, 0x1.99c48be206p-1}, - {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1}, - {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1}, - {-0x1.3f94e00e7d6bcp-46, 0x1.a3e2f4ac44p-1}, - {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1}, - {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1}, - {0x1.1659d8e2d7d38p-44, 0x1.ae255819fp-1}, - {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1}, - {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1}, - {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1}, - {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1}, - {0x1.871a7610e40bdp-45, 0x1.bdce9dcc96p-1}, - {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1}, - {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1}, - {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1}, - {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1}, - {-0x1.9ad57391924a7p-43, 0x1.cdcebd2374p-1}, - {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1}, - {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1}, - {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1}, - {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1}, - {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1}, - {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1}, - {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1}, - {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1}, - {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1}, - {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1}, - {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1}, - {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1}, - {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1}, - {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1}, - {0x1.ef5d00e390ap-44, 0x1.fa406bd244p-1}, - {0.0, 1.0}, -}; - -bool is_odd_integer(double x) { - using FPBits = fputil::FPBits; - FPBits xbits(x); - uint64_t x_u = xbits.uintval(); - unsigned x_e = static_cast(xbits.get_biased_exponent()); - unsigned lsb = - static_cast(cpp::countr_zero(x_u | FPBits::EXP_MASK)); - constexpr unsigned UNIT_EXPONENT = - static_cast(FPBits::EXP_BIAS + FPBits::FRACTION_LEN); - return (x_e + lsb == UNIT_EXPONENT); -} - -bool is_integer(double x) { - using FPBits = fputil::FPBits; - FPBits xbits(x); - uint64_t x_u = xbits.uintval(); - unsigned x_e = static_cast(xbits.get_biased_exponent()); - unsigned lsb = - static_cast(cpp::countr_zero(x_u | FPBits::EXP_MASK)); - constexpr unsigned UNIT_EXPONENT = - static_cast(FPBits::EXP_BIAS + FPBits::FRACTION_LEN); - return (x_e + lsb >= UNIT_EXPONENT); -} - -} // namespace - LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { - using FPBits = fputil::FPBits; - - FPBits xbits(x), ybits(y); - - bool x_sign = xbits.sign() == Sign::NEG; - bool y_sign = ybits.sign() == Sign::NEG; - - FPBits x_abs = xbits.abs(); - FPBits y_abs = ybits.abs(); - - uint64_t x_mant = xbits.get_mantissa(); - uint64_t y_mant = ybits.get_mantissa(); - uint64_t x_u = xbits.uintval(); - uint64_t x_a = x_abs.uintval(); - uint64_t y_a = y_abs.uintval(); - - double e_x = static_cast(xbits.get_exponent()); - uint64_t sign = 0; - - ///////// BEGIN - Check exceptional cases //////////////////////////////////// - // If x or y is signaling NaN - if (x_abs.is_signaling_nan() || y_abs.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - // The double precision number that is closest to 1 is (1 - 2^-53), which has - // log2(1 - 2^-53) ~ -1.715...p-53. - // So if |y| > |1075 / log2(1 - 2^-53)|, and x is finite: - // |y * log2(x)| = 0 or > 1075. - // Hence x^y will either overflow or underflow if x is not zero. - if (LIBC_UNLIKELY(y_mant == 0 || y_a > 0x43d7'4910'd52d'3052 || - x_u == FPBits::one().uintval() || - x_u >= FPBits::inf().uintval() || - x_u < FPBits::min_normal().uintval())) { - // Exceptional exponents. - if (y == 0.0) - return 1.0; - - switch (y_a) { - case 0x3fe0'0000'0000'0000: { // y = +-0.5 - // TODO: speed up x^(-1/2) with rsqrt(x) when available. - if (LIBC_UNLIKELY( - (x == 0.0 || x_u == FPBits::inf(Sign::NEG).uintval()))) { - // pow(-0, 1/2) = +0 - // pow(-inf, 1/2) = +inf - // Make sure it works correctly for FTZ/DAZ. - return y_sign ? 1.0 / (x * x) : (x * x); - } - return y_sign ? (1.0 / fputil::sqrt(x)) : fputil::sqrt(x); - } - case 0x3ff0'0000'0000'0000: // y = +-1.0 - return y_sign ? (1.0 / x) : x; - case 0x4000'0000'0000'0000: // y = +-2.0; - return y_sign ? (1.0 / (x * x)) : (x * x); - } - - // |y| > |1075 / log2(1 - 2^-53)|. - if (y_a > 0x43d7'4910'd52d'3052) { - if (y_a >= 0x7ff0'0000'0000'0000) { - // y is inf or nan - if (y_mant != 0) { - // y is NaN - // pow(1, NaN) = 1 - // pow(x, NaN) = NaN - return (x_u == FPBits::one().uintval()) ? 1.0 : y; - } - - // Now y is +-Inf - if (x_abs.is_nan()) { - // pow(NaN, +-Inf) = NaN - return x; - } - - if (x_a == 0x3ff0'0000'0000'0000) { - // pow(+-1, +-Inf) = 1.0 - return 1.0; - } - - if (x == 0.0 && y_sign) { - // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO - fputil::set_errno_if_required(EDOM); - fputil::raise_except_if_required(FE_DIVBYZERO); - return FPBits::inf().get_val(); - } - // pow (|x| < 1, -inf) = +inf - // pow (|x| < 1, +inf) = 0.0 - // pow (|x| > 1, -inf) = 0.0 - // pow (|x| > 1, +inf) = +inf - return ((x_a < FPBits::one().uintval()) == y_sign) - ? FPBits::inf().get_val() - : 0.0; - } - // x^y will overflow / underflow in double precision. Set y to a - // large enough exponent but not too large, so that the computations - // won't overflow in double precision. - y = y_sign ? -0x1.0p100 : 0x1.0p100; - } - - // y is finite and non-zero. - - if (x_u == FPBits::one().uintval()) { - // pow(1, y) = 1 - return 1.0; - } - - // TODO: Speed things up with pow(2, y) = exp2(y) and pow(10, y) = exp10(y). - - if (x == 0.0) { - bool out_is_neg = x_sign && is_odd_integer(y); - if (y_sign) { - // pow(0, negative number) = inf - fputil::set_errno_if_required(EDOM); - fputil::raise_except_if_required(FE_DIVBYZERO); - return FPBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val(); - } - // pow(0, positive number) = 0 - return out_is_neg ? -0.0 : 0.0; - } - - if (x_a == FPBits::inf().uintval()) { - bool out_is_neg = x_sign && is_odd_integer(y); - if (y_sign) - return out_is_neg ? -0.0 : 0.0; - return FPBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val(); - } - - if (x_a > FPBits::inf().uintval()) { - // x is NaN. - // pow (aNaN, 0) is already taken care above. - return x; - } - - // Normalize denormal inputs. - if (x_a < FPBits::min_normal().uintval()) { - e_x -= 64.0; - x_mant = FPBits(x * 0x1.0p64).get_mantissa(); - } - - // x is finite and negative, and y is a finite integer. - if (x_sign) { - if (is_integer(y)) { - x = -x; - if (is_odd_integer(y)) - // sign = -1.0; - sign = 0x8000'0000'0000'0000; - } else { - // pow( negative, non-integer ) = NaN - fputil::set_errno_if_required(EDOM); - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - } - } - - ///////// END - Check exceptional cases ////////////////////////////////////// - - // x^y = 2^( y * log2(x) ) - // = 2^( y * ( e_x + log2(m_x) ) ) - // First we compute log2(x) = e_x + log2(m_x) - - // Extract exponent field of x. - - // Use the highest 7 fractional bits of m_x as the index for look up tables. - unsigned idx_x = static_cast(x_mant >> (FPBits::FRACTION_LEN - 7)); - // Add the hidden bit to the mantissa. - // 1 <= m_x < 2 - FPBits m_x = FPBits(x_mant | 0x3ff0'0000'0000'0000); - - // Reduced argument for log2(m_x): - // dx = r * m_x - 1. - // The computation is exact, and -2^-8 <= dx < 2^-7. - // Then m_x = (1 + dx) / r, and - // log2(m_x) = log2( (1 + dx) / r ) - // = log2(1 + dx) - log2(r). - - // In order for the overall computations x^y = 2^(y * log2(x)) to have the - // relative errors < 2^-52 (1ULP), we will need to evaluate the exponent part - // y * log2(x) with absolute errors < 2^-52 (or better, 2^-53). Since the - // whole exponent range for double precision is bounded by - // |y * log2(x)| < 1076 ~ 2^10, we need to evaluate log2(x) with absolute - // errors < 2^-53 * 2^-10 = 2^-63. - - // With that requirement, we use the following degree-6 polynomial - // approximation: - // P(dx) ~ log2(1 + dx) / dx - // Generated by Sollya with: - // > P = fpminimax(log2(1 + x)/x, 6, [|D...|], [-2^-8, 2^-7]); P; - // > dirtyinfnorm(log2(1 + x) - x*P, [-2^-8, 2^-7]); - // 0x1.d03cc...p-66 - constexpr double COEFFS[] = {0x1.71547652b82fep0, -0x1.71547652b82e7p-1, - 0x1.ec709dc3b1fd5p-2, -0x1.7154766124215p-2, - 0x1.2776bd90259d8p-2, -0x1.ec586c6f3d311p-3, - 0x1.9c4775eccf524p-3}; - // Error: ulp(dx^2) <= (2^-7)^2 * 2^-52 = 2^-66 - // Extra errors from various computations and rounding directions, the overall - // errors we can be bounded by 2^-65. - - double dx; - DoubleDouble dx_c0; - - // Perform exact range reduction and exact product dx * c0. -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE - dx = fputil::multiply_add(RD[idx_x], m_x.get_val(), -1.0); // Exact - dx_c0 = fputil::exact_mult(COEFFS[0], dx); -#else - double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val(); - dx = fputil::multiply_add(RD[idx_x], m_x.get_val() - c, CD[idx_x]); // Exact - dx_c0 = fputil::exact_mult(dx, COEFFS[0]); // Exact -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - - double dx2 = dx * dx; - double c0 = fputil::multiply_add(dx, COEFFS[2], COEFFS[1]); - double c1 = fputil::multiply_add(dx, COEFFS[4], COEFFS[3]); - double c2 = fputil::multiply_add(dx, COEFFS[6], COEFFS[5]); - - double p = fputil::polyeval(dx2, c0, c1, c2); - - // s = e_x - log2(r) + dx * P(dx) - // Absolute error bound: - // |log2(x) - log2_x.hi - log2_x.lo| < 2^-65. - - // Notice that e_x - log2(r).hi is exact, so we perform an exact sum of - // e_x - log2(r).hi and the high part of the product dx * c0: - // log2_x_hi.hi + log2_x_hi.lo = e_x - log2(r).hi + (dx * c0).hi - DoubleDouble log2_x_hi = - fputil::exact_add(e_x + LOG2_R_DD[idx_x].hi, dx_c0.hi); - // The low part is dx^2 * p + low part of (dx * c0) + low part of -log2(r). - double log2_x_lo = - fputil::multiply_add(dx2, p, dx_c0.lo + LOG2_R_DD[idx_x].lo); - // Perform accurate sums. - DoubleDouble log2_x = fputil::exact_add(log2_x_hi.hi, log2_x_lo); - log2_x.lo += log2_x_hi.lo; - - // To compute 2^(y * log2(x)), we break the exponent into 3 parts: - // y * log(2) = hi + mid + lo, where - // hi is an integer - // mid * 2^6 is an integer - // |lo| <= 2^-7 - // Then: - // x^y = 2^(y * log2(x)) = 2^hi * 2^mid * 2^lo, - // In which 2^mid is obtained from a look-up table of size 2^6 = 64 elements, - // and 2^lo ~ 1 + lo * P(lo). - // Thus, we have: - // hi + mid = 2^-6 * round( 2^6 * y * log2(x) ) - // If we restrict the output such that |hi| < 150, (hi + mid) uses (8 + 6) - // bits, hence, if we use double precision to perform - // round( 2^6 * y * log2(x)) - // the lo part is bounded by 2^-7 + 2^(-(52 - 14)) = 2^-7 + 2^-38 - - // In the following computations: - // y6 = 2^6 * y - // hm = 2^6 * (hi + mid) = round(2^6 * y * log2(x)) ~ round(y6 * s) - // lo6 = 2^6 * lo = 2^6 * (y - (hi + mid)) = y6 * log2(x) - hm. - double y6 = y * 0x1.0p6; // Exact. - - DoubleDouble y6_log2_x = fputil::exact_mult(y6, log2_x.hi); - y6_log2_x.lo = fputil::multiply_add(y6, log2_x.lo, y6_log2_x.lo); - - // Check overflow/underflow. - double scale = 1.0; - - // |2^(hi + mid) - exp2_hi_mid| <= ulp(exp2_hi_mid) / 2 - // Clamp the exponent part into smaller range that fits double precision. - // For those exponents that are out of range, the final conversion will round - // them correctly to inf/max float or 0/min float accordingly. - constexpr double UPPER_EXP_BOUND = 512.0 * 0x1.0p6; - if (LIBC_UNLIKELY(FPBits(y6_log2_x.hi).abs().get_val() >= UPPER_EXP_BOUND)) { - if (FPBits(y6_log2_x.hi).sign() == Sign::POS) { - scale = 0x1.0p512; - y6_log2_x.hi -= 512.0 * 64.0; - if (y6_log2_x.hi > 513.0 * 64.0) - y6_log2_x.hi = 513.0 * 64.0; - } else { - scale = 0x1.0p-512; - y6_log2_x.hi += 512.0 * 64.0; - if (y6_log2_x.hi < (-1076.0 + 512.0) * 64.0) - y6_log2_x.hi = -564.0 * 64.0; - } - } - - double hm = fputil::nearest_integer(y6_log2_x.hi); - - // lo6 = 2^6 * lo. - double lo6_hi = y6_log2_x.hi - hm; - double lo6 = lo6_hi + y6_log2_x.lo; - - int hm_i = static_cast(hm); - unsigned idx_y = static_cast(hm_i) & 0x3f; - - // 2^hi - int64_t exp2_hi_i = static_cast( - static_cast(static_cast(hm_i >> 6)) - << FPBits::FRACTION_LEN); - // 2^mid - int64_t exp2_mid_hi_i = - static_cast(FPBits(EXP2_MID1[idx_y].hi).uintval()); - int64_t exp2_mid_lo_i = - static_cast(FPBits(EXP2_MID1[idx_y].mid).uintval()); - // (-1)^sign * 2^hi * 2^mid - // Error <= 2^hi * 2^-53 - uint64_t exp2_hm_hi_i = - static_cast(exp2_hi_i + exp2_mid_hi_i) + sign; - // The low part could be 0. - uint64_t exp2_hm_lo_i = - idx_y != 0 ? static_cast(exp2_hi_i + exp2_mid_lo_i) + sign - : sign; - double exp2_hm_hi = FPBits(exp2_hm_hi_i).get_val(); - double exp2_hm_lo = FPBits(exp2_hm_lo_i).get_val(); - - // Degree-5 polynomial approximation P(lo6) ~ 2^(lo6 / 2^6) = 2^(lo). - // Generated by Sollya with: - // > P = fpminimax(2^(x/64), 5, [|1, D...|], [-2^-1, 2^-1]); - // > dirtyinfnorm(2^(x/64) - P, [-0.5, 0.5]); - // 0x1.a2b77e618f5c4c176fd11b7659016cde5de83cb72p-60 - constexpr double EXP2_COEFFS[] = {0x1p0, - 0x1.62e42fefa39efp-7, - 0x1.ebfbdff82a23ap-15, - 0x1.c6b08d7076268p-23, - 0x1.3b2ad33f8b48bp-31, - 0x1.5d870c4d84445p-40}; - - double lo6_sqr = lo6 * lo6; - - double d0 = fputil::multiply_add(lo6, EXP2_COEFFS[2], EXP2_COEFFS[1]); - double d1 = fputil::multiply_add(lo6, EXP2_COEFFS[4], EXP2_COEFFS[3]); - double pp = fputil::polyeval(lo6_sqr, d0, d1, EXP2_COEFFS[5]); - - double r = fputil::multiply_add(exp2_hm_hi * lo6, pp, exp2_hm_lo); - r += exp2_hm_hi; - - return r * scale; + return math::pow(x, y); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index 2614ab626c21..ab54dc610301 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -97,6 +97,7 @@ add_fp_unittest( libc.src.__support.math.llogbf16 libc.src.__support.math.logf16 libc.src.__support.math.llogbl + libc.src.__support.math.pow libc.src.__support.math.rsqrtf libc.src.__support.math.rsqrtf16 libc.src.__support.math.sqrtf16 diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index 54b8a970929d..2e6daf0509da 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -142,6 +142,7 @@ TEST(LlvmLibcSharedMathTest, AllDouble) { EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::log10(1.0)); EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::log1p(0.0)); EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::log2(1.0)); + EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::shared::pow(0.0, 0.0)); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::shared::sin(0.0)); EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::sqrt(0.0)); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::shared::tan(0.0)); diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 69b68ba52b45..96421b6076ae 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -3302,6 +3302,24 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_pow", + hdrs = ["src/__support/math/pow.h"], + deps = [ + ":__support_cpp_bit", + ":__support_fputil_double_double", + ":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_sqrt", + ":__support_macros_optimization", + ":__support_math_common_constants", + ":__support_math_exp_constants", + ], +) + libc_support_library( name = "__support_math_exp_constants", hdrs = ["src/__support/math/exp_constants.h"], @@ -5125,11 +5143,8 @@ libc_math_function(name = "nextupf16") libc_math_function( name = "pow", additional_deps = [ - ":__support_fputil_double_double", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", - ":__support_fputil_sqrt", - ":__support_math_common_constants", + ":__support_math_pow", + ":errno", ], )