libclc: Update sinh (#188213)

This was originally ported from rocm device libs in
9f7172965c650627c020264e9dbdb32d005ce69e. Merge in more
recent changes.
This commit is contained in:
Matt Arsenault 2026-03-24 11:56:05 +01:00 committed by GitHub
parent 1b255ef5ac
commit d491f70206
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 106 additions and 185 deletions

View File

@ -134,6 +134,9 @@ _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a);
#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR a);
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR
__clc_ep_exp_extended(__CLC_EP_PAIR x);
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a);
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_ln_hi(__CLC_EP_PAIR a,
__CLC_INTN ea);

View File

@ -441,6 +441,40 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) {
return __clc_isinf(z) ? z : zz;
}
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR
__clc_ep_exp_extended(__CLC_EP_PAIR x) {
const __CLC_FLOATN LOG2E = 0x1.715476p+0f;
const __CLC_FLOATN LN2_HI = 0x1.62e400p-1f;
const __CLC_FLOATN LN2_MED = 0x1.7f7800p-20f;
const __CLC_FLOATN LN2_LO = 0x1.473de6p-34f;
const __CLC_FLOATN POLY_C1 = 0x1.6850e4p-10f;
const __CLC_FLOATN POLY_C2 = 0x1.123bccp-7f;
const __CLC_FLOATN POLY_C3 = 0x1.555b98p-5f;
const __CLC_FLOATN POLY_C4 = 0x1.55548ep-3f;
const __CLC_FLOATN POLY_C5 = 0x1.fffff8p-2f;
__CLC_FLOATN fn = __clc_rint(x.hi * LOG2E);
__CLC_EP_PAIR term_hi = __clc_ep_fast_add(__clc_mad(fn, -LN2_HI, x.hi), x.lo);
__CLC_EP_PAIR term_med = __clc_ep_fast_sub(term_hi, fn * LN2_MED);
__CLC_EP_PAIR t = __clc_ep_fast_sub(term_med, fn * LN2_LO);
__CLC_FLOATN th = t.hi;
__CLC_FLOATN poly1 = __clc_mad(th, POLY_C1, POLY_C2);
__CLC_FLOATN poly2 = __clc_mad(th, poly1, POLY_C3);
__CLC_FLOATN poly3 = __clc_mad(th, poly2, POLY_C4);
__CLC_FLOATN p = __clc_mad(th, poly3, POLY_C5);
__CLC_EP_PAIR sqr_t = __clc_ep_sqr(t);
__CLC_EP_PAIR poly_term = __clc_ep_mul(sqr_t, p);
__CLC_EP_PAIR inner_sum = __clc_ep_fast_add(t, poly_term);
__CLC_EP_PAIR r = __clc_ep_fast_add(1.0f, inner_sum);
return __clc_ep_ldexp(r, __CLC_CONVERT_INTN(fn));
}
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a) {
__CLC_INTN a_exp;
__CLC_GENTYPE m = __clc_frexp(a, &a_exp);
@ -499,6 +533,50 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) {
return __clc_isinf(z) ? z : zz;
}
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR
__clc_ep_exp_extended(__CLC_EP_PAIR x) {
const __CLC_DOUBLEN LOG2E = 0x1.71547652b82fep+0;
const __CLC_DOUBLEN LN2_HI = 0x1.62e42fefa3000p-1;
const __CLC_DOUBLEN LN2_MED = 0x1.3de6af278e000p-42;
const __CLC_DOUBLEN LN2_LO = 0x1.9cc01f97b57a0p-83;
const __CLC_DOUBLEN POLY_C1 = 0x1.ade156a5dcb37p-26;
const __CLC_DOUBLEN POLY_C2 = 0x1.28af3fca7ab0cp-22;
const __CLC_DOUBLEN POLY_C3 = 0x1.71dee623fde64p-19;
const __CLC_DOUBLEN POLY_C4 = 0x1.a01997c89e6b0p-16;
const __CLC_DOUBLEN POLY_C5 = 0x1.a01a014761f6ep-13;
const __CLC_DOUBLEN POLY_C6 = 0x1.6c16c1852b7b0p-10;
const __CLC_DOUBLEN POLY_C7 = 0x1.1111111122322p-7;
const __CLC_DOUBLEN POLY_C8 = 0x1.55555555502a1p-5;
const __CLC_DOUBLEN POLY_C9 = 0x1.5555555555511p-3;
const __CLC_DOUBLEN POLY_C10 = 0x1.000000000000bp-1;
__CLC_DOUBLEN dn = __clc_rint(x.hi * LOG2E);
__CLC_EP_PAIR term_hi = __clc_ep_fast_add(__clc_mad(dn, -LN2_HI, x.hi), x.lo);
__CLC_EP_PAIR term_med = __clc_ep_fast_sub(term_hi, dn * LN2_MED);
__CLC_EP_PAIR t = __clc_ep_fast_sub(term_med, dn * LN2_LO);
__CLC_DOUBLEN th = t.hi;
__CLC_DOUBLEN poly1 = __clc_mad(th, POLY_C1, POLY_C2);
__CLC_DOUBLEN poly2 = __clc_mad(th, poly1, POLY_C3);
__CLC_DOUBLEN poly3 = __clc_mad(th, poly2, POLY_C4);
__CLC_DOUBLEN poly4 = __clc_mad(th, poly3, POLY_C5);
__CLC_DOUBLEN poly5 = __clc_mad(th, poly4, POLY_C6);
__CLC_DOUBLEN poly6 = __clc_mad(th, poly5, POLY_C7);
__CLC_DOUBLEN poly7 = __clc_mad(th, poly6, POLY_C8);
__CLC_DOUBLEN poly8 = __clc_mad(th, poly7, POLY_C9);
__CLC_DOUBLEN p = __clc_mad(th, poly8, POLY_C10);
__CLC_EP_PAIR sqr_t = __clc_ep_sqr(t);
__CLC_EP_PAIR poly_term = __clc_ep_mul(sqr_t, p);
__CLC_EP_PAIR inner_sum = __clc_ep_fast_add(t, poly_term);
__CLC_EP_PAIR r = __clc_ep_fast_add(1.0, inner_sum);
return __clc_ep_ldexp(r, __CLC_CONVERT_INTN(dn));
}
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_DOUBLEN a) {
__CLC_INTN a_exp;
__CLC_DOUBLEN m = __clc_frexp(a, &a_exp);

View File

@ -7,17 +7,11 @@
//===----------------------------------------------------------------------===//
#include "clc/clc_convert.h"
#include "clc/internal/clc.h"
#include "clc/math/clc_copysign.h"
#include "clc/math/clc_exp.h"
#include "clc/math/clc_ep.h"
#include "clc/math/clc_exp2_fast.h"
#include "clc/math/clc_fabs.h"
#include "clc/math/clc_fma.h"
#include "clc/math/clc_mad.h"
#include "clc/math/math.h"
#include "clc/math/tables.h"
#include "clc/relational/clc_isinf.h"
#include "clc/relational/clc_isnan.h"
#include "clc/shared/clc_min.h"
#include "clc/math/clc_sinh.h"
#define __CLC_BODY "clc_sinh.inc"
#include "clc/math/gentype.inc"

View File

@ -8,194 +8,40 @@
#if __CLC_FPSIZE == 32
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) {
// After dealing with special cases the computation is split into regions as
// follows. abs(x) >= max_sinh_arg: sinh(x) = sign(x)*Inf abs(x) >=
// small_threshold: sinh(x) = sign(x)*exp(abs(x))/2 computed using the
// splitexp and scaleDouble functions as for exp_amd(). abs(x) <
// small_threshold: compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
// sinh(x) is then sign(x)*z.
_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_FLOATN __clc_sinh(__CLC_FLOATN x) {
const __CLC_EP_PAIR ln2 = __clc_ep_make_pair(__CLC_FP_LIT(0x1.62e430p-1),
__CLC_FP_LIT(-0x1.05c610p-29));
__CLC_FLOATN absx = __clc_fabs(x);
__CLC_EP_PAIR e = __clc_ep_exp_extended(__clc_ep_sub(absx, ln2));
__CLC_EP_PAIR s = __clc_ep_fast_sub(e, __clc_ep_ldexp(__clc_ep_recip(e), -2));
__CLC_FLOATN z = absx > 0x1.65a9f8p+6f ? __CLC_GENTYPE_INF : s.hi;
const __CLC_GENTYPE max_sinh_arg = 0x1.65a9fap+6f;
const __CLC_GENTYPE small_threshold = 0x1.0a2b24p+3f;
__CLC_UINTN ux = __CLC_AS_UINTN(x);
__CLC_GENTYPE y = __clc_fabs(x);
__CLC_UINTN aux = __CLC_AS_UINTN(y);
__CLC_UINTN xs = ux ^ aux;
// We find the integer part y0 of y and the increment dy = y - y0. We then
// compute z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) where sinh(y0)
// and cosh(y0) are tabulated above.
__CLC_INTN ind = __CLC_CONVERT_INTN(y);
ind = __CLC_CONVERT_UINTN(ind) > 36U ? 0 : ind;
__CLC_GENTYPE dy = y - __CLC_CONVERT_GENTYPE(ind);
__CLC_GENTYPE dy2 = dy * dy;
__CLC_GENTYPE sdy = __clc_mad(
dy2,
__clc_mad(
dy2,
__clc_mad(
dy2,
__clc_mad(
dy2,
__clc_mad(dy2,
__clc_mad(dy2, 0.7746188980094184251527126e-12f,
0.160576793121939886190847e-9f),
0.250521176994133472333666e-7f),
0.275573191913636406057211e-5f),
0.198412698413242405162014e-3f),
0.833333333333329931873097e-2f),
0.166666666666666667013899e0f);
sdy = __clc_mad(sdy, dy * dy2, dy);
__CLC_GENTYPE cdy = __clc_mad(
dy2,
__clc_mad(
dy2,
__clc_mad(
dy2,
__clc_mad(
dy2,
__clc_mad(dy2,
__clc_mad(dy2, 0.1163921388172173692062032e-10f,
0.208744349831471353536305e-8f),
0.275573350756016588011357e-6f),
0.248015872460622433115785e-4f),
0.138888888889814854814536e-2f),
0.416666666666660876512776e-1f),
0.500000000000000005911074e0f);
cdy = __clc_mad(cdy, dy2, 1.0f);
__CLC_GENTYPE sinhcoshh = __CLC_USE_TABLE(sinhcosh_tbl_head, ind);
__CLC_GENTYPE sinhcosht = __CLC_USE_TABLE(sinhcosh_tbl_tail, ind);
__CLC_GENTYPE z = __clc_mad(sinhcosht, sdy, sinhcoshh * cdy);
z = __CLC_AS_GENTYPE(xs | __CLC_AS_UINTN(z));
// When y is large enough so that the negative exponential is negligible,
// so sinh(y) is approximated by sign(x)*exp(y)/2.
__CLC_GENTYPE t = __clc_exp(y - 0x1.62e500p-1f);
__CLC_GENTYPE zsmall = __clc_mad(0x1.a0210ep-18f, t, t);
zsmall = __CLC_AS_GENTYPE(xs | __CLC_AS_UINTN(zsmall));
z = y >= small_threshold ? zsmall : z;
// Corner cases
__CLC_GENTYPE zinf = __CLC_AS_GENTYPE(PINFBITPATT_SP32 | xs);
z = y >= max_sinh_arg ? zinf : z;
z = aux > PINFBITPATT_SP32 || aux < 0x38800000U ? x : z;
return z;
z = absx < 0x1.0p-12f ? absx : z;
return __clc_copysign(z, x);
}
#elif __CLC_FPSIZE == 64
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) {
// After dealing with special cases the computation is split into
// regions as follows:
//
// abs(x) >= max_sinh_arg:
// sinh(x) = sign(x)*Inf
//
// abs(x) >= small_threshold:
// sinh(x) = sign(x)*exp(abs(x))/2 computed using the
// splitexp and scaleDouble functions as for exp_amd().
//
// abs(x) < small_threshold:
// compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
// sinh(x) is then sign(x)*z.
_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_DOUBLEN __clc_sinh(__CLC_DOUBLEN x) {
const __CLC_EP_PAIR ln2 = __clc_ep_make_pair(
__CLC_FP_LIT(0x1.62e42fefa39efp-1), __CLC_FP_LIT(0x1.abc9e3b39803fp-56));
// 0x408633ce8fb9f87e
const __CLC_GENTYPE max_sinh_arg = 7.10475860073943977113e+02;
// This is where exp(-x) is insignificant compared to exp(x) = ln(2^27)
const __CLC_GENTYPE small_threshold = 0x1.2b708872320e2p+4;
__CLC_GENTYPE y = __clc_fabs(x);
// In this range we find the integer part y0 of y
// and the increment dy = y - y0. We then compute
// z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
// where sinh(y0) and cosh(y0) are obtained from tables
__CLC_INTN ind = __clc_min(__CLC_CONVERT_INTN(y), 36);
__CLC_GENTYPE dy = y - __CLC_CONVERT_GENTYPE(ind);
__CLC_GENTYPE dy2 = dy * dy;
__CLC_GENTYPE sdy =
dy * dy2 *
__clc_fma(
dy2,
__clc_fma(
dy2,
__clc_fma(
dy2,
__clc_fma(
dy2,
__clc_fma(dy2,
__clc_fma(dy2, 0.7746188980094184251527126e-12,
0.160576793121939886190847e-9),
0.250521176994133472333666e-7),
0.275573191913636406057211e-5),
0.198412698413242405162014e-3),
0.833333333333329931873097e-2),
0.166666666666666667013899e0);
__CLC_GENTYPE cdy =
dy2 *
__clc_fma(
dy2,
__clc_fma(
dy2,
__clc_fma(
dy2,
__clc_fma(
dy2,
__clc_fma(dy2,
__clc_fma(dy2, 0.1163921388172173692062032e-10,
0.208744349831471353536305e-8),
0.275573350756016588011357e-6),
0.248015872460622433115785e-4),
0.138888888889814854814536e-2),
0.416666666666660876512776e-1),
0.500000000000000005911074e0);
// At this point sinh(dy) is approximated by dy + sdy.
// Shift some significant bits from dy to sdy.
__CLC_GENTYPE sdy1 =
__CLC_AS_GENTYPE(__CLC_AS_ULONGN(dy) & 0xfffffffff8000000UL);
__CLC_GENTYPE sdy2 = sdy + (dy - sdy1);
__CLC_GENTYPE cl = __CLC_USE_TABLE(cosh_tbl_head, ind);
__CLC_GENTYPE ct = __CLC_USE_TABLE(cosh_tbl_tail, ind);
__CLC_GENTYPE sl = __CLC_USE_TABLE(sinh_tbl_head, ind);
__CLC_GENTYPE st = __CLC_USE_TABLE(sinh_tbl_tail, ind);
__CLC_GENTYPE z =
__clc_fma(cl, sdy1,
__clc_fma(sl, cdy,
__clc_fma(cl, sdy2,
__clc_fma(ct, sdy1,
__clc_fma(st, cdy, ct * sdy2)) +
st))) +
sl;
// Other cases
z = (y < 0x1.0p-28) || __clc_isnan(x) || __clc_isinf(x) ? y : z;
__CLC_GENTYPE t = __clc_exp(y - 0x1.62e42fefa3800p-1);
t = __clc_fma(t, -0x1.ef35793c76641p-45, t);
z = y >= small_threshold ? t : z;
z = y >= max_sinh_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : z;
__CLC_DOUBLEN absx = __clc_fabs(x);
__CLC_EP_PAIR e = __clc_ep_exp_extended(__clc_ep_sub(absx, ln2));
__CLC_EP_PAIR s = __clc_ep_fast_sub(e, __clc_ep_ldexp(__clc_ep_recip(e), -2));
__CLC_DOUBLEN z = absx >= 0x1.633ce8fb9f87ep+9 ? __CLC_GENTYPE_INF : s.hi;
z = absx < 0x1.0p-27 ? absx : z;
return __clc_copysign(z, x);
}
#elif __CLC_FPSIZE == 16
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) {
return __CLC_CONVERT_GENTYPE(__clc_sinh(__CLC_CONVERT_FLOATN(x)));
_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_HALFN __clc_sinh(__CLC_HALFN x) {
__CLC_FLOATN x_float = __CLC_CONVERT_FLOATN(x) * (__CLC_FLOATN)M_LOG2E;
__CLC_FLOATN result =
0.5f * (__clc_exp2_fast(x_float) - __clc_exp2_fast(-x_float));
return __CLC_CONVERT_HALFN(result);
}
#endif