libclc: Update trigpi functions (#187579)

These were originally ported from the ROCm device libs in bc81ebefb7d9d9d71d20bfee2ce4cccb09701e9b. Merge in more recent changes.
2026-03-20 08:24:23 +01:00 · 2026-03-20 08:24:23 +01:00 · 421bf13e4b
commit 421bf13e4b
parent a971089cb8
21 changed files with 310 additions and 398 deletions
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_decl.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_decl.inc
@ -15,6 +15,9 @@ typedef struct __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE) {
 _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
 __clc_sincos_reduced_eval(__CLC_FLOATN x);

+// Evaluates the sin(pi * x) / cos(pi * x) pair for an already-reduced
+// argument and returns both values in one struct.
+// NOTE(review): the callers in this change pass the remainder produced by
+// __clc_piArgReductionS; confirm the accuracy range before reusing elsewhere.
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
+__clc_sincospi_reduced_eval(__CLC_FLOATN x);
+
 _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
                                                      __CLC_FLOATN y);
 _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp16_decl.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp16_decl.inc
@ -15,5 +15,8 @@ typedef struct __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE) {
 _CLC_DEF _CLC_OVERLOAD __CLC_SINCOS_RET_GENTYPE
 __clc_sincos_reduced_eval(__CLC_HALFN x);

+// Half-precision variant: evaluates the sin(pi * x) / cos(pi * x) pair for an
+// already-reduced argument and returns both values in one struct.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
+__clc_sincospi_reduced_eval(__CLC_HALFN x);
+
 _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_HALFN *r,
                                                      __CLC_HALFN x);
--- a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64_decl.inc
@ -15,19 +15,13 @@ typedef struct __CLC_XCONCAT(__clc_sincos_ret_, __CLC_GENTYPE) {
 _CLC_DEF _CLC_OVERLOAD __CLC_SINCOS_RET_GENTYPE
 __clc_sincos_reduced_eval(__CLC_DOUBLEN x, __CLC_DOUBLEN y);

+// Double-precision variant: evaluates the sin(pi * x) / cos(pi * x) pair for
+// an already-reduced argument and returns both values in one struct. Unlike
+// __clc_sincos_reduced_eval above, it takes a single argument (no tail term).
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
+__clc_sincospi_reduced_eval(__CLC_DOUBLEN x);
+
 _CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan_reduced_eval(__CLC_DOUBLEN x,
                                                            __CLC_DOUBLEN y,
                                                            __CLC_INTN is_odd);

-_CLC_DECL _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
-                                                __CLC_DOUBLEN xx,
-                                                private __CLC_DOUBLEN *sinval,
-                                                private __CLC_DOUBLEN *cosval);
-
-_CLC_DECL _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx,
-                                             private __CLC_DOUBLEN *leadval,
-                                             private __CLC_DOUBLEN *tailval);
-
 _CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small(
    __CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_DOUBLEN *rr);

--- a/libclc/clc/include/clc/math/clc_sincospi.h
+++ b/libclc/clc/include/clc/math/clc_sincospi.h
@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_SINCOSPI_H__
+#define __CLC_MATH_CLC_SINCOSPI_H__
+
+#include "clc/internal/clc.h"
+
+// Declare __clc_sincospi for every gentype by expanding the shared
+// "unary function with pointer output" declaration template with
+// __CLC_FUNCTION set to __clc_sincospi.
+// NOTE(review): the exact set of pointer address spaces declared is decided
+// by unary_decl_with_ptr.inc, which is not part of this change.
+#define __CLC_BODY "clc/math/unary_decl_with_ptr.inc"
+#define __CLC_FUNCTION __clc_sincospi
+
+#include "clc/math/gentype.inc"
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SINCOSPI_H__
--- a/libclc/clc/include/clc/math/clc_trigpi_helpers.h
+++ b/libclc/clc/include/clc/math/clc_trigpi_helpers.h
@ -0,0 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_TRIGPI_HELPERS_H__
+#define __CLC_MATH_CLC_TRIGPI_HELPERS_H__
+
+#include "clc/internal/clc.h"
+
+// Expand the helper declarations in clc_trigpi_helpers_decl.inc once per
+// floating-point gentype.
+#define __CLC_BODY "clc/math/clc_trigpi_helpers_decl.inc"
+#include "clc/math/gentype.inc"
+
+#endif // __CLC_MATH_CLC_TRIGPI_HELPERS_H__
--- a/libclc/clc/include/clc/math/clc_trigpi_helpers_decl.inc
+++ b/libclc/clc/include/clc/math/clc_trigpi_helpers_decl.inc
@ -0,0 +1,10 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Argument reduction shared by the pi-scaled trig functions: writes the
+// reduced argument to *r and returns an integer selector for the caller.
+// NOTE(review): the callers in this change pass fabs(x); behaviour for
+// negative x is not exercised here.
+_CLC_DECL _CLC_OVERLOAD __CLC_INTN
+__clc_piArgReductionS(private __CLC_GENTYPE *r, __CLC_GENTYPE x);
--- a/libclc/clc/include/clc/math/gentype.inc
+++ b/libclc/clc/include/clc/math/gentype.inc
@ -69,6 +69,9 @@
 #define __CLC_CONVERT_S_GENTYPE __CLC_XCONCAT(__clc_convert_, __CLC_S_GENTYPE)
 #define __CLC_CONVERT_U_GENTYPE __CLC_XCONCAT(__clc_convert_, __CLC_U_GENTYPE)

+// Sign-bit mask of the current floating-point gentype, expressed in its signed
+// and unsigned integer counterparts. The mask is built from an unsigned
+// literal so the 64-bit case never left-shifts a signed 1 into the sign bit,
+// and the expansion is fully parenthesised. __CLC_FPSIZE is defined further
+// down in this file; it is only expanded at the point of use.
+#define __CLC_GENTYPE_S_SIGNBIT ((__CLC_S_GENTYPE)(1ull << (__CLC_FPSIZE - 1)))
+#define __CLC_GENTYPE_U_SIGNBIT ((__CLC_U_GENTYPE)(1ull << (__CLC_FPSIZE - 1)))
+
 #if (!defined(__CLC_HALF_ONLY) && !defined(__CLC_DOUBLE_ONLY))
 #define __CLC_SCALAR_GENTYPE float
 #define __CLC_FPSIZE 32
--- a/libclc/clc/lib/generic/CMakeLists.txt
+++ b/libclc/clc/lib/generic/CMakeLists.txt
@ -156,6 +156,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
  math/clc_rsqrt.cl
  math/clc_sin.cl
  math/clc_sincos.cl
+  math/clc_sincospi.cl
  math/clc_sincos_helpers.cl
  math/clc_sinh.cl
  math/clc_sinpi.cl
@ -167,6 +168,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
  math/clc_tanh.cl
  math/clc_tanpi.cl
  math/clc_tgamma.cl
+  math/clc_trigpi_helpers.cl
  math/clc_trunc.cl
  mem_fence/clc_mem_fence.cl
  misc/clc_shuffle.cl
--- a/libclc/clc/lib/generic/math/clc_cospi.cl
+++ b/libclc/clc/lib/generic/math/clc_cospi.cl
@ -6,12 +6,8 @@
 //
 //===----------------------------------------------------------------------===//

-#include "clc/clc_convert.h"
-#include "clc/float/definitions.h"
-#include "clc/internal/clc.h"
-#include "clc/math/clc_fabs.h"
-#include "clc/math/clc_sincos_helpers.h"
-#include "clc/math/math.h"
+#include "clc/math/clc_cospi.h"
+#include "clc/math/clc_sincospi.h"

 #define __CLC_BODY "clc_cospi.inc"
 #include "clc/math/gentype.inc"
--- a/libclc/clc/lib/generic/math/clc_cospi.inc
+++ b/libclc/clc/lib/generic/math/clc_cospi.inc
@ -6,111 +6,8 @@
 //
 //===----------------------------------------------------------------------===//

-#if __CLC_FPSIZE == 32
-
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) {
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_INTN ix = __CLC_AS_INTN(absx);
-  __CLC_INTN iax = __CLC_CONVERT_INTN(absx);
-  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
-  __CLC_INTN xodd = (iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0;
-
-  // Initialize with return for +-Inf and NaN
-  __CLC_INTN ir = QNANBITPATT_SP32;
-
-  // 2^24 <= |x| < Inf, the result is always even integer
-  ir = ix < PINFBITPATT_SP32 ? 0x3f800000 : ir;
-
-  // 2^23 <= |x| < 2^24, the result is always integer
-  ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir;
-
-  // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-  // r < 1.0
-  __CLC_GENTYPE a = 1.0f - r;
-  __CLC_INTN e = 1;
-  __CLC_INTN s = xodd ^ (__CLC_INTN)0x80000000;
-
-  // r <= 0.75
-  __CLC_INTN c = r <= 0.75f;
-  a = c ? r - 0.5f : a;
-  e = c ? 0 : e;
-
-  // r < 0.5
-  c = r < 0.5f;
-  a = c ? 0.5f - r : a;
-  s = c ? xodd : s;
-
-  // r <= 0.25
-  c = r <= 0.25f;
-  a = c ? r : a;
-  e = c ? 1 : e;
-
-  __CLC_GENTYPE sinval, cosval;
-  __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval);
-  __CLC_INTN jr = s ^ __CLC_AS_INTN(e != 0 ? cosval : sinval);
-
-  ir = ix < 0x4b000000 ? jr : ir;
-
-  return __CLC_AS_GENTYPE(ir);
+  // Only the cosine is wanted here; the sine returned by __clc_sincospi is
+  // deliberately discarded.
+  __CLC_GENTYPE cos_result;
+  (void)__clc_sincospi(x, &cos_result);
+  return cos_result;
 }
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) {
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_LONGN ix = __CLC_AS_LONGN(absx);
-  __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx);
-  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
-  __CLC_LONGN xodd =
-      (iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L;
-
-  // Initialize with return for +-Inf and NaN
-  __CLC_LONGN ir = QNANBITPATT_DP64;
-
-  // 2^53 <= |x| < Inf, the result is always even integer
-  ir = ix < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ir;
-
-  // 2^52 <= |x| < 2^53, the result is always integer
-  ir = absx < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir;
-
-  // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval
-
-  // r < 1.0
-  __CLC_GENTYPE a = 1.0 - r;
-  __CLC_LONGN e = 1;
-  __CLC_LONGN s = xodd ^ (__CLC_LONGN)0x8000000000000000L;
-
-  // r <= 0.75
-  __CLC_LONGN c = r <= 0.75;
-  __CLC_GENTYPE t = r - 0.5;
-  a = c ? t : a;
-  e = c ? 0 : e;
-
-  // r < 0.5
-  c = r < 0.5;
-  t = 0.5 - r;
-  a = c ? t : a;
-  s = c ? xodd : s;
-
-  // r <= 0.25
-  c = r <= 0.25;
-  a = c ? r : a;
-  e = c ? 1 : e;
-
-  __CLC_GENTYPE sinval, cosval;
-  __clc_sincos_piby4(a * M_PI, 0.0, &sinval, &cosval);
-  __CLC_LONGN jr = s ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval);
-
-  ir = absx < 0x1.0p+52 ? jr : ir;
-
-  return __CLC_AS_GENTYPE(ir);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) {
-  return __CLC_CONVERT_GENTYPE(__clc_cospi(__CLC_CONVERT_FLOATN(x)));
-}
-
-#endif
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
@ -41,6 +41,33 @@ __clc_sincos_reduced_eval(__CLC_FLOATN x) {
  return ret;
 }

+// Single-precision polynomial evaluation of sin(pi * x) and cos(pi * x) for a
+// reduced argument. Both polynomials are evaluated in u = x^2: the sine is
+// assembled as pi * x + x^3 * P(u) and the cosine as 1 + u * Q(u).
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
+__clc_sincospi_reduced_eval(__CLC_GENTYPE x) {
+  const __CLC_GENTYPE u = x * x;
+
+  // Odd series: P(u), then the x^3 scaling, then the linear pi * x term.
+  const __CLC_GENTYPE sin_poly =
+      __clc_mad(u,
+                __clc_mad(u, __clc_mad(u, 0x1.eb5482p-3f, -0x1.3e497cp-1f),
+                          0x1.468e6cp+1f),
+                -0x1.4abc1cp+2f);
+  const __CLC_GENTYPE sin_tail = x * u * sin_poly;
+  const __CLC_GENTYPE sin_value = __clc_mad(x, 0x1.921fb6p+1f, sin_tail);
+
+  // Even series: Q(u), then the u scaling plus the constant term 1.
+  const __CLC_GENTYPE cos_poly = __clc_mad(
+      u,
+      __clc_mad(u,
+                __clc_mad(u, __clc_mad(u, 0x1.97ca88p-5f, 0x1.c85d3ap-3f),
+                          -0x1.55a3b4p+0f),
+                0x1.03c1a6p+2f),
+      -0x1.3bd3ccp+2f);
+  const __CLC_GENTYPE cos_value = __clc_mad(u, cos_poly, 1.0f);
+
+  __CLC_SINCOS_RET_GENTYPE result;
+  result.sin = sin_value;
+  result.cos = cos_value;
+  return result;
+}
+
 // Evaluate single precisions sin and cos of value in interval [-pi/4, pi/4]
 _CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_FLOATN x,
                                               private __CLC_FLOATN *sinval,
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp16.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp16.inc
@ -34,3 +34,19 @@ __clc_sincos_reduced_eval(__CLC_HALFN x) {
  ret.sin = __clc_mad(x, t * __clc_mad(t, 0x1.0bp-7h, -0x1.554p-3h), x);
  return ret;
 }
+
+// Half-precision polynomial evaluation of sin(pi * x) and cos(pi * x) for a
+// reduced argument, using the same pi * x + x^3 * P(u) / 1 + u * Q(u) layout
+// as the wider types (u = x^2) with shorter polynomials.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
+__clc_sincospi_reduced_eval(__CLC_HALFN x) {
+  const __CLC_HALFN u = x * x;
+
+  // Odd series.
+  const __CLC_HALFN sin_poly = __clc_mad(u, 0x1.b84p+0h, -0x1.46cp+2h);
+  const __CLC_HALFN sin_tail = x * u * sin_poly;
+  const __CLC_HALFN sin_value = __clc_mad(x, 0x1.92p+1h, sin_tail);
+
+  // Even series.
+  const __CLC_HALFN cos_poly = __clc_mad(u, 0x1.fbp+1h, -0x1.3bcp+2h);
+  const __CLC_HALFN cos_value = __clc_mad(u, cos_poly, 1.0h);
+
+  __CLC_SINCOS_RET_GENTYPE result;
+  result.sin = sin_value;
+  result.cos = cos_value;
+  return result;
+}
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@ -41,6 +41,29 @@ __clc_sincos_reduced_eval(__CLC_DOUBLEN x, __CLC_DOUBLEN y) {
  return ret;
 }

+// Double-precision polynomial evaluation of sin(pi * x) and cos(pi * x) for a
+// reduced argument. Both polynomials are evaluated in t = x^2 with nested
+// multiply-adds (innermost coefficient = highest power): the sine is
+// pi * x + x^3 * P(t) and the cosine is 1 + t * Q(t).
+// Marked _CLC_CONST to agree with the declaration and with the float and half
+// definitions of the same helper.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_SINCOS_RET_GENTYPE
+__clc_sincospi_reduced_eval(__CLC_GENTYPE x) {
+  __CLC_GENTYPE t = x * x;
+
+  __CLC_GENTYPE sx = __clc_mad(
+      t,
+      __clc_mad(
+          t,
+          __clc_mad(
+              t,
+              __clc_mad(t,
+                        __clc_mad(t, 0x1.e357ef99eb0bbp-12,
+                                  -0x1.e2fe76fdffd2bp-8),
+                        0x1.50782d5f14825p-4),
+              -0x1.32d2ccdfe9424p-1),
+          0x1.466bc67754fffp+1),
+      -0x1.4abbce625be09p+2);
+  sx = x * t * sx;
+  sx = __clc_mad(x, 0x1.921fb54442d18p+1, sx);
+
+  __CLC_GENTYPE cx = __clc_mad(
+      t,
+      __clc_mad(
+          t,
+          __clc_mad(
+              t,
+              __clc_mad(
+                  t,
+                  __clc_mad(t,
+                            __clc_mad(t, -0x1.b167302e21c33p-14,
+                                      0x1.f9c89ca1d4f33p-10),
+                            -0x1.a6d1e7294bff9p-6),
+                  0x1.e1f5067b90b37p-3),
+              -0x1.55d3c7e3c325bp+0),
+          0x1.03c1f081b5a67p+2),
+      -0x1.3bd3cc9be45dep+2);
+  cx = __clc_mad(t, cx, 1.0);
+
+  __CLC_SINCOS_RET_GENTYPE ret;
+  ret.cos = cx;
+  ret.sin = sx;
+  return ret;
+}
+
 _CLC_DEF _CLC_OVERLOAD __CLC_DOUBLEN __clc_tan_reduced_eval(__CLC_DOUBLEN x,
                                                            __CLC_DOUBLEN xx,
                                                            __CLC_INTN is_odd) {
@ -123,68 +146,6 @@ _CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
  *cosval = cp;
 }

-_CLC_DEF _CLC_OVERLOAD void __clc_tan_piby4(__CLC_DOUBLEN x, __CLC_DOUBLEN xx,
-                                            private __CLC_DOUBLEN *leadval,
-                                            private __CLC_DOUBLEN *tailval) {
-  // 0x3fe921fb54442d18
-  const __CLC_DOUBLEN piby4_lead = 7.85398163397448278999e-01;
-  // 0x3c81a62633145c06
-  const __CLC_DOUBLEN piby4_tail = 3.06161699786838240164e-17;
-
-  // In order to maintain relative precision transform using the identity:
-  // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
-  // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4.
-
-  __CLC_LONGN ca = x > 0.68;
-  __CLC_LONGN cb = x < -0.68;
-  __CLC_DOUBLEN transform = ca ? 1.0 : 0.0;
-  transform = cb ? -1.0 : transform;
-
-  __CLC_DOUBLEN tx = __clc_fma(-transform, x, piby4_lead) +
-                     __clc_fma(-transform, xx, piby4_tail);
-  __CLC_LONGN c = ca | cb;
-  x = c ? tx : x;
-  xx = c ? 0.0 : xx;
-
-  // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
-  __CLC_DOUBLEN t1 = x;
-  __CLC_DOUBLEN r = __clc_fma(2.0, x * xx, x * x);
-
-  __CLC_DOUBLEN a = __clc_fma(r,
-                              __clc_fma(r, 0.224044448537022097264602535574e-3,
-                                        -0.229345080057565662883358588111e-1),
-                              0.372379159759792203640806338901e0);
-
-  __CLC_DOUBLEN b =
-      __clc_fma(r,
-                __clc_fma(r,
-                          __clc_fma(r, -0.232371494088563558304549252913e-3,
-                                    0.260656620398645407524064091208e-1),
-                          -0.515658515729031149329237816945e0),
-                0.111713747927937668539901657944e1);
-
-  __CLC_DOUBLEN t2 = __clc_fma(MATH_DIVIDE(a, b), x * r, xx);
-
-  __CLC_DOUBLEN tp = t1 + t2;
-
-  // Compute -1.0/(t1 + t2) accurately
-  __CLC_DOUBLEN z1 =
-      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(tp) & 0xffffffff00000000L);
-  __CLC_DOUBLEN z2 = t2 - (z1 - t1);
-  __CLC_DOUBLEN trec = -MATH_RECIP(tp);
-  __CLC_DOUBLEN trec_top =
-      __CLC_AS_GENTYPE(__CLC_AS_ULONGN(trec) & 0xffffffff00000000L);
-
-  __CLC_DOUBLEN tpr = __clc_fma(
-      __clc_fma(trec_top, z2, __clc_fma(trec_top, z1, 1.0)), trec, trec_top);
-
-  __CLC_DOUBLEN tpt = transform * (1.0 - MATH_DIVIDE(2.0 * tp, 1.0 + tp));
-  __CLC_DOUBLEN tptr = transform * (MATH_DIVIDE(2.0 * tp, tp - 1.0) - 1.0);
-
-  *leadval = c ? tpt : tp;
-  *tailval = c ? tptr : tpr;
-}
-
 // Reduction for small sized arguments
 _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_remainder_piby2_small(
    __CLC_DOUBLEN x, private __CLC_DOUBLEN *rh, private __CLC_DOUBLEN *rt) {
--- a/libclc/clc/lib/generic/math/clc_sincospi.cl
+++ b/libclc/clc/lib/generic/math/clc_sincospi.cl
@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_cos.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_sin.h"
+#include "clc/math/clc_sincos_helpers.h"
+#include "clc/math/clc_trigpi_helpers.h"
+#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_select.h"
+
+#define __CLC_BODY "clc_sincospi.inc"
+#include "clc/math/gentype.inc"
--- a/libclc/clc/lib/generic/math/clc_sincospi.inc
+++ b/libclc/clc/lib/generic/math/clc_sincospi.inc
@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Computes sin(pi * x) (returned) and cos(pi * x) (stored through cos_out).
+//
+// |x| is split by __clc_piArgReductionS into a selector in [0, 3] and a
+// remainder. The reduced-argument polynomials are evaluated on the remainder
+// and mapped back: odd selectors exchange the sine and cosine polynomials,
+// selectors above 1 flip the sign bit of both results, and the sine also takes
+// the sign of x. Infinite inputs are replaced by NaN before anything else.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sincospi(__CLC_GENTYPE x, __private __CLC_GENTYPE *cos_out) {
+  x = __clc_select(x, __CLC_GENTYPE_NAN,
+                   __CLC_CONVERT_S_GENTYPE(__clc_isinf(x)));
+
+  __CLC_GENTYPE magnitude = __clc_fabs(x);
+
+  __CLC_GENTYPE remainder;
+  __CLC_INTN selector = __clc_piArgReductionS(&remainder, magnitude);
+
+  __CLC_SINCOS_RET_GENTYPE poly = __clc_sincospi_reduced_eval(remainder);
+
+  // Selectors 2 and 3 negate both outputs.
+  __CLC_S_GENTYPE negate = __CLC_CONVERT_S_GENTYPE(selector > 1)
+                               ? __CLC_GENTYPE_S_SIGNBIT
+                               : (__CLC_S_GENTYPE)0;
+  // Odd selectors swap which polynomial feeds which output.
+  __CLC_S_GENTYPE is_odd = __CLC_CONVERT_S_GENTYPE((selector & 1) != 0);
+
+  // fabs(x) and x differ only in the sign bit, so their XOR is the sign of x.
+  __CLC_GENTYPE sin_unsigned = is_odd ? poly.cos : poly.sin;
+  __CLC_S_GENTYPE sin_bits =
+      __CLC_AS_S_GENTYPE(sin_unsigned) ^ negate ^
+      (__CLC_AS_S_GENTYPE(magnitude) ^ __CLC_AS_S_GENTYPE(x));
+
+  // NOTE(review): the explicit conversion presumably keeps the scalar 16-bit
+  // case at gentype width after integer promotion of the XOR; confirm before
+  // removing it.
+  __CLC_GENTYPE cos_unsigned = is_odd ? -poly.sin : poly.cos;
+  *cos_out = __CLC_AS_GENTYPE(
+      __CLC_CONVERT_S_GENTYPE(__CLC_AS_S_GENTYPE(cos_unsigned) ^ negate));
+
+  return __CLC_AS_GENTYPE(sin_bits);
+}
+
+// Defines the __clc_sincospi overload whose cosine output pointer lives in
+// `addrspace`: the private-pointer overload is called on a private temporary
+// and the cosine is then stored through the caller's pointer.
+#define __CLC_SINCOSPI_DEF(addrspace)                                          \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sincospi(                         \
+      __CLC_GENTYPE x, addrspace __CLC_GENTYPE *cos_out) {                     \
+    __CLC_GENTYPE cos_private;                                                 \
+    __CLC_GENTYPE sin_value = __clc_sincospi(x, &cos_private);                 \
+    *cos_out = cos_private;                                                    \
+    return sin_value;                                                          \
+  }
+
+__CLC_SINCOSPI_DEF(local)
+__CLC_SINCOSPI_DEF(global)
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+__CLC_SINCOSPI_DEF(generic)
+#endif
+
+// Keep the helper macro local to this body file.
+#undef __CLC_SINCOSPI_DEF
--- a/libclc/clc/lib/generic/math/clc_sinpi.cl
+++ b/libclc/clc/lib/generic/math/clc_sinpi.cl
@ -6,12 +6,8 @@
 //
 //===----------------------------------------------------------------------===//

-#include "clc/clc_convert.h"
-#include "clc/float/definitions.h"
-#include "clc/internal/clc.h"
-#include "clc/math/clc_fabs.h"
-#include "clc/math/clc_sincos_helpers.h"
-#include "clc/math/math.h"
+#include "clc/math/clc_sincospi.h"
+#include "clc/math/clc_sinpi.h"

 #define __CLC_BODY "clc_sinpi.inc"
 #include "clc/math/gentype.inc"
--- a/libclc/clc/lib/generic/math/clc_sinpi.inc
+++ b/libclc/clc/lib/generic/math/clc_sinpi.inc
@ -6,109 +6,7 @@
 //
 //===----------------------------------------------------------------------===//

-#if __CLC_FPSIZE == 32
-
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN xsgn = ix & (__CLC_INTN)0x80000000;
-  ix ^= xsgn;
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_INTN iax = __CLC_CONVERT_INTN(absx);
-  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
-  __CLC_INTN xodd =
-      xsgn ^ ((iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0);
-
-  // Initialize with return for +-Inf and NaN
-  __CLC_INTN ir = QNANBITPATT_SP32;
-
-  // 2^23 <= |x| < Inf, the result is always integer
-  ir = ix < PINFBITPATT_SP32 ? xsgn : ir;
-
-  // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-  // r < 1.0
-  __CLC_GENTYPE a = 1.0f - r;
-  __CLC_INTN e = 0;
-
-  // r <= 0.75
-  __CLC_INTN c = r <= 0.75f;
-  a = c ? r - 0.5f : a;
-  e = c ? 1 : e;
-
-  // r < 0.5
-  c = r < 0.5f;
-  a = c ? 0.5f - r : a;
-
-  // 0 < r <= 0.25
-  c = r <= 0.25f;
-  a = c ? r : a;
-  e = c ? 0 : e;
-
-  __CLC_GENTYPE sinval, cosval;
-  __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval);
-  __CLC_INTN jr = xodd ^ __CLC_AS_INTN(e != 0 ? cosval : sinval);
-
-  ir = ix < 0x4b000000 ? jr : ir;
-
-  return __CLC_AS_GENTYPE(ir);
+  // __clc_sincospi returns the sine; the cosine it also produces is written
+  // to a scratch variable and ignored.
+  __CLC_GENTYPE discarded_cos;
+  __CLC_GENTYPE sin_result = __clc_sincospi(x, &discarded_cos);
+  return sin_result;
 }
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
-  __CLC_LONGN ix = __CLC_AS_LONGN(x);
-  __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L;
-  ix ^= xsgn;
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx);
-  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
-  __CLC_LONGN xodd =
-      xsgn ^
-      ((iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L);
-
-  // Initialize with return for +-Inf and NaN
-  __CLC_LONGN ir = QNANBITPATT_DP64;
-
-  // 2^23 <= |x| < Inf, the result is always integer
-  ir = ix < PINFBITPATT_DP64 ? xsgn : ir;
-
-  // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-  // r < 1.0
-  __CLC_GENTYPE a = 1.0 - r;
-  __CLC_LONGN e = 0;
-
-  //  r <= 0.75
-  __CLC_LONGN c = r <= 0.75;
-  __CLC_GENTYPE t = r - 0.5;
-  a = c ? t : a;
-  e = c ? 1 : e;
-
-  // r < 0.5
-  c = r < 0.5;
-  t = 0.5 - r;
-  a = c ? t : a;
-
-  // r <= 0.25
-  c = r <= 0.25;
-  a = c ? r : a;
-  e = c ? 0 : e;
-
-  __CLC_GENTYPE api = a * M_PI;
-
-  __CLC_GENTYPE sinval, cosval;
-  __clc_sincos_piby4(api, 0.0, &sinval, &cosval);
-  __CLC_LONGN jr = xodd ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval);
-
-  ir = absx < 0x1.0p+52 ? jr : ir;
-
-  return __CLC_AS_GENTYPE(ir);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) {
-  return __CLC_CONVERT_GENTYPE(__clc_sinpi(__CLC_CONVERT_FLOATN(x)));
-}
-
-#endif
--- a/libclc/clc/lib/generic/math/clc_tanpi.cl
+++ b/libclc/clc/lib/generic/math/clc_tanpi.cl
@ -10,9 +10,13 @@
 #include "clc/float/definitions.h"
 #include "clc/internal/clc.h"
 #include "clc/math/clc_fabs.h"
+#include "clc/math/clc_mad.h"
 #include "clc/math/clc_native_recip.h"
 #include "clc/math/clc_sincos_helpers.h"
+#include "clc/math/clc_trigpi_helpers.h"
 #include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_select.h"

 #define __CLC_BODY "clc_tanpi.inc"
 #include "clc/math/gentype.inc"
--- a/libclc/clc/lib/generic/math/clc_tanpi.inc
+++ b/libclc/clc/lib/generic/math/clc_tanpi.inc
@ -8,125 +8,82 @@

 #if __CLC_FPSIZE == 32

-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) {
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN xsgn = ix & (__CLC_INTN)SIGNBIT_SP32;
-  __CLC_INTN xnsgn = xsgn ^ (__CLC_INTN)SIGNBIT_SP32;
-  ix ^= xsgn;
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_INTN iax = __CLC_CONVERT_INTN(absx);
-  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
-  __CLC_INTN xodd = xsgn ^ __CLC_AS_INTN((iax & 0x1) != 0 ? SIGNBIT_SP32 : 0);
+// Single-precision evaluation of tan(pi * x) for a reduced argument.
+// A polynomial in s = x^2 is scaled by x^3 and added to pi * x to give t; the
+// function returns -1 / t where is_odd is set and t otherwise.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_tanpi_reduced_eval(__CLC_GENTYPE x, __CLC_INTN is_odd) {
+  __CLC_GENTYPE s = x * x;

-  // Initialize with return for +-Inf and NaN
-  __CLC_INTN ir = QNANBITPATT_SP32;
+  __CLC_GENTYPE t = __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+            __clc_mad(s, __clc_mad(s,
+                0x1.7d2bd4p+16f, 0x1.a4d306p+12f), 0x1.435004p+11f), 0x1.4b6926p+9f),
+                0x1.451e22p+7f), 0x1.467a9cp+5f), 0x1.4abb6ap+3f);

-  // 2^24 <= |x| < Inf, the result is always even integer
-  ir = ix < PINFBITPATT_SP32 ? xsgn : ir;
+  t = x * s * t;
+  t = __clc_mad(x, 0x1.921fb6p+1f, t);

-  // 2^23 <= |x| < 2^24, the result is always integer
-  ir = ix < 0x4b800000 ? xodd : ir;
+  __CLC_GENTYPE tr = __CLC_FP_LIT(-1.0) / t;

-  // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
-
-  // r < 1.0
-  __CLC_GENTYPE a = 1.0f - r;
-  __CLC_INTN e = 0;
-  __CLC_INTN s = xnsgn;
-
-  // r <= 0.75
-  __CLC_INTN c = r <= 0.75f;
-  a = c ? r - 0.5f : a;
-  e = c ? 1 : e;
-  s = c ? xsgn : s;
-
-  // r < 0.5
-  c = r < 0.5f;
-  a = c ? 0.5f - r : a;
-  s = c ? xnsgn : s;
-
-  // 0 < r <= 0.25
-  c = r <= 0.25f;
-  a = c ? r : a;
-  e = c ? 0 : e;
-  s = c ? xsgn : s;
-
-  __CLC_GENTYPE t = __clc_tanf_piby4(a * M_PI_F, 0);
-  __CLC_GENTYPE tr = -__clc_native_recip(t);
-  __CLC_INTN jr = s ^ __CLC_AS_INTN(e != 0 ? tr : t);
-
-  jr = r == 0.5f ? xodd | 0x7f800000 : jr;
-
-  ir = ix < 0x4b000000 ? jr : ir;
-
-  return __CLC_AS_GENTYPE(ir);
+  return is_odd ? tr : t;
 }

 #elif __CLC_FPSIZE == 64

-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) {
-  __CLC_LONGN ix = __CLC_AS_LONGN(x);
-  __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L;
-  __CLC_LONGN xnsgn = xsgn ^ (__CLC_LONGN)0x8000000000000000L;
-  ix ^= xsgn;
-  __CLC_GENTYPE absx = __clc_fabs(x);
-  __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx);
-  __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax);
-  __CLC_LONGN xodd =
-      xsgn ^ __CLC_AS_LONGN((iax & 0x1) != 0 ? 0x8000000000000000L : 0L);
+// Double-precision evaluation of tan(pi * x) for a reduced argument.
+// A polynomial in s = x^2 is scaled by x^3 and added to pi * x to give t; the
+// function returns -1 / t where is_odd is set and t otherwise.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_DOUBLEN
+__clc_tanpi_reduced_eval(__CLC_DOUBLEN x, __CLC_INTN is_odd) {
+  __CLC_GENTYPE s = x * x;
+  __CLC_GENTYPE t = __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+             __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+             __clc_mad(s, __clc_mad(s, __clc_mad(s, __clc_mad(s,
+             __clc_mad(s,
+                 0x1.3fad0a71ea6d1p+32, -0x1.11a76ac97377bp+30), 0x1.ba2bcaca6da1bp+27), -0x1.79e8e2d7aaf57p+22),
+                 0x1.c1c1102e46eccp+21), 0x1.31291bbcb5588p+19), 0x1.486b2d6bb3db2p+17), 0x1.45be1b46ff156p+15),
+                 0x1.45f61b419c746p+13), 0x1.45f311045a4ffp+11), 0x1.45f4739a998c7p+9), 0x1.45fff9b243050p+7),
+                 0x1.466bc6775cf74p+5), 0x1.4abbce625be8bp+3);
+  t = x * s * t;
+  t = __clc_mad(x, 0x1.921fb54442d18p+1, t);

-  // Initialize with return for +-Inf and NaN
-  __CLC_LONGN ir = QNANBITPATT_DP64;
+  __CLC_GENTYPE tr = __CLC_FP_LIT(-1.0) / t;

-  // 2^53 <= |x| < Inf, the result is always even integer
-  ir = ix < PINFBITPATT_DP64 ? xsgn : ir;
-
-  // 2^52 <= |x| < 2^53, the result is always integer
-  ir = ix < 0x4340000000000000L ? xodd : ir;
-
-  // 0x1.0p-14 <= |x| < 2^53, result depends on which 0.25 interval
-
-  // r < 1.0
-  __CLC_GENTYPE a = 1.0 - r;
-  __CLC_LONGN e = 0;
-  __CLC_LONGN s = xnsgn;
-
-  // r <= 0.75
-  __CLC_LONGN c = r <= 0.75;
-  __CLC_GENTYPE t = r - 0.5;
-  a = c ? t : a;
-  e = c ? 1 : e;
-  s = c ? xsgn : s;
-
-  // r < 0.5
-  c = r < 0.5;
-  t = 0.5 - r;
-  a = c ? t : a;
-  s = c ? xnsgn : s;
-
-  // r <= 0.25
-  c = r <= 0.25;
-  a = c ? r : a;
-  e = c ? 0 : e;
-  s = c ? xsgn : s;
-
-  __CLC_GENTYPE api = a * M_PI;
-  __CLC_GENTYPE lo, hi;
-  __clc_tan_piby4(api, 0.0, &lo, &hi);
-  __CLC_LONGN jr = s ^ __CLC_AS_LONGN(e != 0 ? hi : lo);
-
-  __CLC_LONGN si = xodd | 0x7ff0000000000000L;
-  jr = r == 0.5 ? si : jr;
-
-  ir = ix < 0x4330000000000000L ? jr : ir;
-
-  return __CLC_AS_GENTYPE(ir);
+  return __CLC_CONVERT_LONGN(is_odd) ? tr : t;
 }

 #elif __CLC_FPSIZE == 16

-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) {
-  return __CLC_CONVERT_GENTYPE(__clc_tanpi(__CLC_CONVERT_FLOATN(x)));
+// Half-precision evaluation of tan(pi * x) for a reduced argument.
+// A short polynomial in x^2 is scaled by x^3 and added to pi * x; the result
+// is that value, or its negated reciprocal where is_odd is set.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_HALFN
+__clc_tanpi_reduced_eval(__CLC_HALFN x, __CLC_INTN is_odd) {
+  __CLC_HALFN x_sq = x * x;
+
+  __CLC_HALFN poly = __clc_mad(
+      x_sq, __clc_mad(x_sq, 0x1.3d8p+8h, 0x1.fe4p+4h), 0x1.508p+3h);
+
+  __CLC_HALFN tail = x * x_sq * poly;
+  __CLC_HALFN tan_value = __clc_mad(x, 0x1.92p+1h, tail);
+
+  __CLC_HALFN neg_recip = __CLC_FP_LIT(-1.0) / tan_value;
+
+  return __CLC_CONVERT_SHORTN(is_odd) ? neg_recip : tan_value;
 }

 #endif
+
+// Computes tan(pi * x).
+//
+// |x| is split by __clc_piArgReductionS into a selector in [0, 3] and a
+// remainder; the reduced evaluator returns the tangent for even selectors and
+// its negated reciprocal for odd ones. The sign of x is applied at the end.
+// Infinite inputs are replaced by NaN before anything else.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) {
+  x = __clc_select(x, __CLC_GENTYPE_NAN,
+                   __CLC_CONVERT_S_GENTYPE(__clc_isinf(x)));
+
+  __CLC_GENTYPE magnitude = __clc_fabs(x);
+
+  __CLC_GENTYPE remainder;
+  __CLC_INTN selector = __clc_piArgReductionS(&remainder, magnitude);
+
+  __CLC_GENTYPE tan_value =
+      __clc_tanpi_reduced_eval(remainder, (selector & 1) != 0);
+
+  // With a zero remainder the evaluator yields a signed zero or infinity;
+  // for selectors 1 and 2 its sign is inverted (e.g. |x| = 0.5 gives
+  // -1 / +0 = -inf from the evaluator, which becomes +inf here).
+  __CLC_S_GENTYPE middle_selector =
+      __CLC_CONVERT_S_GENTYPE((selector == 1) || (selector == 2));
+  __CLC_S_GENTYPE exact_fix =
+      (middle_selector && (remainder == __CLC_FP_LIT(0.0)))
+          ? __CLC_GENTYPE_S_SIGNBIT
+          : (__CLC_S_GENTYPE)0;
+
+  // tan is odd: fold the sign bit of x into the result.
+  __CLC_S_GENTYPE bits = (__CLC_AS_S_GENTYPE(tan_value) ^ exact_fix) ^
+                         (__CLC_AS_S_GENTYPE(x) & __CLC_GENTYPE_S_SIGNBIT);
+
+  return __CLC_AS_GENTYPE(bits);
+}
--- a/libclc/clc/lib/generic/math/clc_trigpi_helpers.cl
+++ b/libclc/clc/lib/generic/math/clc_trigpi_helpers.cl
@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/clc_convert.h"
+#include "clc/math/clc_fract.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_rint.h"
+#include "clc/math/clc_trigpi_helpers.h"
+
+#define __CLC_BODY "clc_trigpi_helpers.inc"
+#include "clc/math/gentype.inc"
--- a/libclc/clc/lib/generic/math/clc_trigpi_helpers.inc
+++ b/libclc/clc/lib/generic/math/clc_trigpi_helpers.inc
@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Argument reduction for the pi-scaled trig functions.
+//
+// Values above 1 are first replaced by 2 * fract(x / 2). The (possibly
+// replaced) value is then rounded to the nearest multiple of 0.5: *reduced
+// receives the signed distance to that multiple and the return value is the
+// low two bits of the multiple's index.
+// NOTE(review): assumes __clc_fract returns the fractional part in [0, 1) and
+// that callers pass a non-negative x (both callers in this change pass
+// fabs(x)); confirm.
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN
+__clc_piArgReductionS(private __CLC_GENTYPE *reduced, __CLC_GENTYPE x) {
+  __CLC_GENTYPE ignored; // second output of __clc_fract, not needed here
+  __CLC_GENTYPE wrapped =
+      __CLC_FP_LIT(2.0) * __clc_fract(__CLC_FP_LIT(0.5) * x, &ignored);
+  __CLC_GENTYPE folded = x > __CLC_FP_LIT(1.0) ? wrapped : x;
+  __CLC_GENTYPE half_steps = __clc_rint(__CLC_FP_LIT(2.0) * folded);
+
+  *reduced = __clc_mad(half_steps, __CLC_FP_LIT(-0.5), folded);
+  return __CLC_CONVERT_INTN(half_steps) & 0x3;
+}