libclc: Improve float trig function handling (#187264)

Most of this was originally ported from rocm device libs in
c0ab2f81e3ab5c7a4c2e0b812a873c3a7f9dca8b, so merge
in more recent changes.
This commit is contained in:
Matt Arsenault 2026-03-18 14:10:58 +01:00 committed by GitHub
parent d7dbf1bd64
commit b15fa374ff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 97 additions and 127 deletions

View File

@ -6,6 +6,15 @@
//
//===----------------------------------------------------------------------===//
typedef struct __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE) {
__CLC_GENTYPE sin, cos;
} __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE);
#define __CLC_SINCOS_RET_GENTYPE __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE)
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
__clc_sincos_reduced_eval(__CLC_FLOATN x);
_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
__CLC_FLOATN y);
_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
@ -19,5 +28,4 @@ _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x,
__CLC_INTN regn);
_CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_FLOATN *r,
private __CLC_FLOATN *rr,
__CLC_FLOATN x);

View File

@ -12,14 +12,13 @@ _CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_cos(__CLC_FLOATN x) {
x = __clc_select(x, __CLC_GENTYPE_NAN, __clc_isinf(x));
__CLC_FLOATN absx = __clc_fabs(x);
__CLC_FLOATN r0, r1;
__CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
__CLC_FLOATN r0;
__CLC_INTN regn = __clc_argReductionS(&r0, absx);
__CLC_SINCOS_RET_GENTYPE eval = __clc_sincos_reduced_eval(r0);
eval.sin = -eval.sin;
__CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1);
__CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
__CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
return __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
__CLC_FLOATN c = (regn & 1) != 0 ? eval.sin : eval.cos;
return __CLC_AS_FLOATN(__CLC_AS_UINTN(c) ^ (regn > 1 ? SIGNBIT_SP32 : 0));
}
#elif __CLC_FPSIZE == 16

View File

@ -13,15 +13,14 @@ _CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_sin(__CLC_FLOATN x) {
__CLC_FLOATN absx = __clc_fabs(x);
__CLC_FLOATN r0, r1;
__CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
__CLC_FLOATN r0;
__CLC_INTN regn = __clc_argReductionS(&r0, absx);
__CLC_SINCOS_RET_GENTYPE eval = __clc_sincos_reduced_eval(r0);
__CLC_FLOATN ss = __clc_sinf_piby4(r0, r1);
__CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
__CLC_FLOATN s = (regn & 1) != 0 ? eval.cos : eval.sin;
__CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
return __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
(__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
return __CLC_AS_FLOATN(__CLC_AS_UINTN(s) ^ (regn > 1 ? SIGNBIT_SP32 : 0) ^
(__CLC_AS_UINTN(x) ^ __CLC_AS_UINTN(absx)));
}
#elif __CLC_FPSIZE == 16

View File

@ -8,13 +8,14 @@
#include "clc/clc_convert.h"
#include "clc/integer/clc_clz.h"
#include "clc/integer/clc_mul_hi.h"
#include "clc/internal/clc.h"
#include "clc/math/clc_fma.h"
#include "clc/math/clc_frexp.h"
#include "clc/math/clc_ldexp.h"
#include "clc/math/clc_mad.h"
#include "clc/math/clc_native_divide.h"
#include "clc/math/clc_rint.h"
#include "clc/math/clc_sincos_helpers.h"
#include "clc/math/clc_trunc.h"
#include "clc/math/math.h"

View File

@ -8,71 +8,36 @@
#pragma OPENCL FP_CONTRACT OFF
_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
__CLC_FLOATN y) {
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
__clc_sincos_reduced_eval(__CLC_FLOATN x) {
// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
// = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
// = x * f(w)
// where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
// We use a minimax approximation of (f(w) - 1) / w
// because this produces an expansion in even powers of x.
const __CLC_FLOATN sin_c1 = -0x1.55553ap-3f; // ~-1/3!
const __CLC_FLOATN sin_c2 = 0x1.110388p-7f; // ~1/5!
const __CLC_FLOATN sin_c3 = -0x1.983304p-13f; // ~1/7!
const __CLC_FLOATN c1 = -0.1666666666e0f;
const __CLC_FLOATN c2 = 0.8333331876e-2f;
const __CLC_FLOATN c3 = -0.198400874e-3f;
const __CLC_FLOATN c4 = 0.272500015e-5f;
const __CLC_FLOATN c5 = -2.5050759689e-08f; // 0xb2d72f34
const __CLC_FLOATN c6 = 1.5896910177e-10f; // 0x2f2ec9d3
const __CLC_FLOATN cos_c1 = -0x1.000008p-1f; // ~1/2!
const __CLC_FLOATN cos_c2 = 0x1.5557eep-5f; // ~1/4!
const __CLC_FLOATN cos_c3 = -0x1.6c9e76p-10f; // ~1/6!
const __CLC_FLOATN cos_c4 = 0x1.aea668p-16f; // ~1/8!
__CLC_FLOATN z = x * x;
__CLC_FLOATN v = z * x;
__CLC_FLOATN r = __clc_mad(
z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2);
__CLC_FLOATN ret =
x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y));
__CLC_FLOATN t = x * x;
return ret;
}
__CLC_FLOATN s =
__clc_mad(x, t * __clc_mad(t, __clc_mad(t, sin_c3, sin_c2), sin_c1), x);
_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
__CLC_FLOATN y) {
// Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
// = f(w)
// where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
// We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
// because this produces an expansion in even powers of x.
__CLC_FLOATN c = __clc_mad(
t,
__clc_mad(t, __clc_mad(t, __clc_mad(t, cos_c4, cos_c3), cos_c2), cos_c1),
1.0f);
const __CLC_FLOATN c1 = 0.416666666e-1f;
const __CLC_FLOATN c2 = -0.138888876e-2f;
const __CLC_FLOATN c3 = 0.248006008e-4f;
const __CLC_FLOATN c4 = -0.2730101334e-6f;
const __CLC_FLOATN c5 = 2.0875723372e-09f; // 0x310f74f6
const __CLC_FLOATN c6 = -1.1359647598e-11f; // 0xad47d74e
__CLC_FLOATN z = x * x;
__CLC_FLOATN r =
z *
__clc_mad(
z,
__clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3),
c2),
c1);
// if |x| < 0.3
__CLC_FLOATN qx = 0.0f;
__CLC_INTN ix = __CLC_AS_INTN(x) & EXSIGNBIT_SP32;
// 0.78125 > |x| >= 0.3
__CLC_FLOATN xby4 = __CLC_AS_FLOATN(ix - 0x01000000);
qx = ((ix >= 0x3e99999a) & (ix <= 0x3f480000)) ? xby4 : qx;
// x > 0.78125
qx = ix > 0x3f480000 ? 0.28125f : qx;
__CLC_FLOATN hz = __clc_mad(z, 0.5f, -qx);
__CLC_FLOATN a = 1.0f - qx;
__CLC_FLOATN ret = a - (hz - __clc_mad(z, r, -x * y));
__CLC_SINCOS_RET_GENTYPE ret;
ret.cos = c;
ret.sin = s;
return ret;
}
@ -152,53 +117,29 @@ _CLC_DEF _CLC_OVERLOAD void __clc_fullMulS(private __CLC_FLOATN *hi,
}
_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_removePi2S(private __CLC_FLOATN *hi,
private __CLC_FLOATN *lo,
__CLC_FLOATN x) {
// 72 bits of pi/2
const __CLC_FLOATN fpiby2_1 = (__CLC_FLOATN)0xC90FDA / 0x1.0p+23f;
const __CLC_FLOATN fpiby2_1_h = (__CLC_FLOATN)0xC90 / 0x1.0p+11f;
const __CLC_FLOATN fpiby2_1_t = (__CLC_FLOATN)0xFDA / 0x1.0p+23f;
const __CLC_FLOATN fpiby2_2 = (__CLC_FLOATN)0xA22168 / 0x1.0p+47f;
const __CLC_FLOATN fpiby2_2_h = (__CLC_FLOATN)0xA22 / 0x1.0p+35f;
const __CLC_FLOATN fpiby2_2_t = (__CLC_FLOATN)0x168 / 0x1.0p+47f;
const __CLC_FLOATN fpiby2_3 = (__CLC_FLOATN)0xC234C4 / 0x1.0p+71f;
const __CLC_FLOATN fpiby2_3_h = (__CLC_FLOATN)0xC23 / 0x1.0p+59f;
const __CLC_FLOATN fpiby2_3_t = (__CLC_FLOATN)0x4C4 / 0x1.0p+71f;
const __CLC_FLOATN twobypi = 0x1.45f306p-1f;
const __CLC_FLOATN piby2_h = 0x1.921fb4p+0f;
const __CLC_FLOATN piby2_m = 0x1.4442d0p-24f;
const __CLC_FLOATN piby2_l = 0x1.846988p-48f;
__CLC_FLOATN fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f));
__CLC_FLOATN fn = __clc_rint(x * twobypi);
// subtract n * pi/2 from x
__CLC_FLOATN rhead, rtail;
__clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
__CLC_FLOATN v = x - rhead;
__CLC_FLOATN rem = v + (((x - v) - rhead) - rtail);
__CLC_FLOATN r = __clc_fma(
fn, -piby2_l, __clc_fma(fn, -piby2_m, __clc_fma(fn, -piby2_h, x)));
*hi = r;
__CLC_FLOATN rhead2, rtail2;
__clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
v = rem - rhead2;
rem = v + (((rem - v) - rhead2) - rtail2);
__CLC_FLOATN rhead3, rtail3;
__clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
v = rem - rhead3;
*hi = v + ((rem - v) - rhead3);
*lo = -rtail3;
return fnpi2;
return fn;
}
_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionSmallS(
private __CLC_FLOATN *r, private __CLC_FLOATN *rr, __CLC_FLOATN x) {
__CLC_FLOATN fnpi2 = __clc_removePi2S(r, rr, x);
_CLC_DEF _CLC_OVERLOAD __CLC_INTN
__clc_argReductionSmallS(private __CLC_FLOATN *r, __CLC_FLOATN x) {
__CLC_FLOATN fnpi2 = __clc_removePi2S(r, x);
return __CLC_CONVERT_INTN(fnpi2) & 0x3;
}
_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
private __CLC_FLOATN *r, private __CLC_FLOATN *rr, __CLC_FLOATN x) {
_CLC_DEF _CLC_OVERLOAD __CLC_INTN
__clc_argReductionLargeS(private __CLC_FLOATN *r, __CLC_FLOATN x) {
__CLC_INTN xe;
__CLC_FLOATN m = __clc_frexp(x, &xe);
--xe;
@ -248,7 +189,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
p6 = __CLC_CONVERT_UINTN(a);
p7 = __CLC_CONVERT_UINTN(a >> 32);
__CLC_UINTN fbits = (__CLC_UINTN)224 + (__CLC_UINTN)23 - __CLC_AS_UINTN(xe);
__CLC_UINTN fbits =
(__CLC_UINTN)224 + (__CLC_UINTN)23 - __CLC_CONVERT_UINTN(xe);
// shift amount to get 2 lsb of integer part at top 2 bits
// min: 25 (xe=18) max: 134 (xe=127)
@ -262,7 +204,7 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
p4 = c ? p2 : p4;
p3 = c ? p1 : p3;
p2 = c ? p0 : p2;
shift -= __CLC_CONVERT_UINTN((-c) & (__CLC_INTN)64);
shift -= c ? 64u : 0u;
c = shift > 31;
p7 = c ? p6 : p7;
@ -270,14 +212,14 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
p5 = c ? p4 : p5;
p4 = c ? p3 : p4;
p3 = c ? p2 : p3;
shift -= __CLC_CONVERT_UINTN((-c) & (__CLC_INTN)32);
shift -= c ? 32u : 0u;
c = shift > 31;
p7 = c ? p6 : p7;
p6 = c ? p5 : p6;
p5 = c ? p4 : p5;
p4 = c ? p3 : p4;
shift -= __CLC_CONVERT_UINTN((-c) & (__CLC_INTN)32);
shift -= c ? 32u : 0u;
// bitalign cannot handle a shift of 32
c = shift > 0;
@ -290,7 +232,7 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
p5 = c ? t5 : p5;
// Get 2 lsb of int part and msb of fraction
__CLC_INTN i = __CLC_AS_INTN(p7 >> 29U);
__CLC_INTN i = __CLC_CONVERT_INTN(p7 >> 29U);
// Scoot up 2 more bits so only fraction remains
p7 = bitalign(p7, p6, (__CLC_UINTN)30u);
@ -354,24 +296,22 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
rt = rt - (t - rh);
*r = t;
*rr = rt;
return ((i >> 1) + (i & 1)) & 0x3;
}
_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_FLOATN *r,
private __CLC_FLOATN *rr,
__CLC_FLOATN x) {
__CLC_INTN is_large = x >= (__CLC_FLOATN)0x1.0p+23f;
__CLC_INTN is_large = x >= (__CLC_FLOATN)0x1.0p+17f;
#ifdef __CLC_SCALAR
if (is_large)
return __clc_argReductionLargeS(r, rr, x);
return __clc_argReductionSmallS(r, rr, x);
return __clc_argReductionLargeS(r, x);
return __clc_argReductionSmallS(r, x);
#else
__CLC_FLOATN r1, rr1, r2, rr2;
__CLC_INTN ret1 = __clc_argReductionSmallS(&r1, &rr1, x);
__CLC_INTN ret2 = __clc_argReductionLargeS(&r2, &rr2, x);
*r = is_large ? r2 : r1;
*rr = is_large ? rr2 : rr1;
return is_large ? ret2 : ret1;
__CLC_FLOATN large_result, small_result;
__CLC_INTN small_n = __clc_argReductionSmallS(&small_result, x);
__CLC_INTN large_n = __clc_argReductionLargeS(&large_result, x);
*r = is_large ? large_result : small_result;
return is_large ? large_n : small_n;
#endif
}

View File

@ -9,7 +9,11 @@
#include "clc/clc_convert.h"
#include "clc/float/definitions.h"
#include "clc/internal/clc.h"
#include "clc/math/clc_div_fast.h"
#include "clc/math/clc_fabs.h"
#include "clc/math/clc_fma.h"
#include "clc/math/clc_mad.h"
#include "clc/math/clc_recip_fast.h"
#include "clc/math/clc_sincos_helpers.h"
#include "clc/math/math.h"
#include "clc/math/tables.h"

View File

@ -8,17 +8,36 @@
#if __CLC_FPSIZE == 32
_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan_reduced_eval(__CLC_GENTYPE x,
__CLC_INTN is_odd) {
__CLC_GENTYPE s = x * x;
__CLC_GENTYPE a = __clc_mad(s, -0x1.19dba6p-6f, 0x1.8a8b0ep-2f);
__CLC_GENTYPE b = __clc_mad(s, __clc_mad(s, 0x1.2e2900p-6f, -0x1.07266ep-1f),
0x1.27e84ap+0f);
__CLC_GENTYPE p = s * __clc_div_fast(a, b);
__CLC_GENTYPE t = __clc_fma(p, x, x);
__CLC_GENTYPE tt = __clc_fma(p, x, -(t - x));
__CLC_GENTYPE tr = -__clc_recip_fast(t);
__CLC_GENTYPE e = __clc_fma(tt, tr, __clc_fma(t, tr, 1.0f));
tr = __clc_fma(e, tr, tr);
return is_odd ? tr : t;
}
_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan(__CLC_GENTYPE x) {
x = __clc_select(x, __CLC_GENTYPE_NAN, __clc_isinf(x));
__CLC_GENTYPE absx = __clc_fabs(x);
__CLC_UINTN x_signbit = __CLC_AS_UINTN(x) & SIGNBIT_SP32;
__CLC_GENTYPE r0, r1;
__CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
__CLC_GENTYPE r0;
__CLC_INTN regn = __clc_argReductionS(&r0, absx);
__CLC_GENTYPE t = __clc_tanf_piby4(r0 + r1, regn);
return __CLC_AS_GENTYPE(__CLC_AS_UINTN(t) ^ x_signbit);
__CLC_GENTYPE t = __clc_tan_reduced_eval(r0, regn & 1);
return __CLC_AS_FLOATN(__CLC_AS_UINTN(t) ^ x_signbit);
}
#elif __CLC_FPSIZE == 64