Matt Arsenault eed1f2749d
libclc: Use special division for atan2 for DAZ (#190248)
The AMDGPU DAZ fdiv works fine in this case, so there's
maybe something better we could do here.
2026-04-02 22:18:17 +02:00

124 lines
3.4 KiB
C++

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma OPENCL FP_CONTRACT OFF
#if __CLC_FPSIZE == 32
// atan2(y, x) for 32-bit floating-point gentypes.
//
// The ratio of the smaller to the larger of |x| and |y| is handed to
// __clc_atan_reduced, and the result is then corrected by a sequence of
// selects. Each select can override the ones before it, so the order of the
// statements below is significant. The sign of y is applied last.
_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_FLOATN __clc_atan2(__CLC_FLOATN y,
                                                           __CLC_FLOATN x) {
  const __CLC_FLOATN three_quarter_pi = 0x1.2d97c8p+1f;
  const __CLC_FLOATN quarter_pi = 0x1.921fb6p-1f;
  const __CLC_FLOATN half_pi = 0x1.921fb6p+0f;
  const __CLC_FLOATN full_pi = 0x1.921fb6p+1f;

  __CLC_FLOATN abs_x = __clc_fabs(x);
  __CLC_FLOATN abs_y = __clc_fabs(y);
  __CLC_FLOATN larger = __clc_fmax(abs_x, abs_y);
  __CLC_FLOATN smaller = __clc_fmin(abs_x, abs_y);

  __CLC_FLOATN ratio;
  if (__clc_denormals_are_zero_fp32()) {
    // When the larger magnitude exceeds 2^96, both the divisor and the
    // quotient are multiplied by 2^-32; otherwise the scale is 1 and the
    // multiplications are no-ops.
    // NOTE(review): presumably this keeps the divisor inside the range that
    // __clc_div_fast handles when denormals are flushed -- confirm.
    __CLC_FLOATN scale = larger > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
    __CLC_FLOATN scaled_divisor = scale * larger;
    ratio = scale * __clc_div_fast(smaller, scaled_divisor);
  } else {
    ratio = smaller / larger;
  }

  __CLC_FLOATN angle = __clc_atan_reduced(ratio);

  // |y| > |x|: use pi/2 minus the reduced angle.
  __CLC_FLOATN complement = half_pi - angle;
  angle = abs_y > abs_x ? complement : angle;

  // x < 0: use pi minus the angle.
  __CLC_FLOATN reflected = full_pi - angle;
  angle = x < 0.0f ? reflected : angle;

  // y == 0: the magnitude is pi when the sign bit of x is set, else 0.
  __CLC_FLOATN zero_y_angle = __clc_signbit(x) ? full_pi : 0.0f;
  angle = y == 0.0f ? zero_y_angle : angle;

  // x and y are both +-Inf: 3*pi/4 for negative x, pi/4 otherwise.
  __CLC_FLOATN diagonal = x < 0.0f ? three_quarter_pi : quarter_pi;
  angle = (__clc_isinf(x) & __clc_isinf(y)) ? diagonal : angle;

  // x or y is NaN.
  angle = __clc_isunordered(x, y) ? FLT_NAN : angle;

  // The result carries the sign of y.
  return __clc_copysign(angle, y);
}
#elif __CLC_FPSIZE == 64
// atan2(y, x) for 64-bit floating-point gentypes.
//
// The ratio of the smaller to the larger of |x| and |y| is handed to
// __clc_atan_reduced, and the result is then corrected by a sequence of
// selects. Each select can override the ones before it, so the order of the
// statements below is significant. The sign of y is applied last.
_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_DOUBLEN __clc_atan2(__CLC_DOUBLEN y,
                                                            __CLC_DOUBLEN x) {
  const __CLC_DOUBLEN three_quarter_pi = 0x1.2d97c7f3321d2p+1;
  const __CLC_DOUBLEN quarter_pi = 0x1.921fb54442d18p-1;
  const __CLC_DOUBLEN half_pi = 0x1.921fb54442d18p+0;
  const __CLC_DOUBLEN full_pi = 0x1.921fb54442d18p+1;

  __CLC_DOUBLEN abs_x = __clc_fabs(x);
  __CLC_DOUBLEN abs_y = __clc_fabs(y);
  __CLC_DOUBLEN smaller = __clc_fmin(abs_x, abs_y);
  __CLC_DOUBLEN larger = __clc_fmax(abs_x, abs_y);

  // Sign bit of x; unlike an "x < 0" comparison this is also set for -0.0.
  __CLC_LONGN x_sign_set = __clc_signbit(x);

  __CLC_DOUBLEN ratio = smaller / larger;
  __CLC_DOUBLEN angle = __clc_atan_reduced(ratio);

  // |x| < |y|: use pi/2 minus the reduced angle.
  __CLC_DOUBLEN complement = half_pi - angle;
  angle = abs_x < abs_y ? complement : angle;

  // Sign bit of x set: use pi minus the angle.
  __CLC_DOUBLEN reflected = full_pi - angle;
  angle = x_sign_set ? reflected : angle;

  // y == 0: the magnitude is pi when the sign bit of x is set, else 0.
  __CLC_DOUBLEN zero_y_angle = x_sign_set ? full_pi : 0.0;
  angle = y == 0.0 ? zero_y_angle : angle;

  // x and y are both +-Inf: 3*pi/4 when the sign bit of x is set, pi/4
  // otherwise.
  // NOTE(review): the copysign here looks redundant with the one applied to
  // the final result below -- confirm before removing.
  __CLC_DOUBLEN diagonal = x_sign_set ? three_quarter_pi : quarter_pi;
  diagonal = __clc_copysign(diagonal, y);
  angle = (__clc_isinf(x) && __clc_isinf(y)) ? diagonal : angle;

  // x or y is NaN.
  angle = __clc_isunordered(x, y) ? DBL_NAN : angle;

  // The result carries the sign of y.
  return __clc_copysign(angle, y);
}
#elif __CLC_FPSIZE == 16
// atan2(y, x) for 16-bit floating-point gentypes.
//
// The ratio of the smaller to the larger of |x| and |y| is handed to
// __clc_atan_reduced, and the result is then corrected by a sequence of
// selects. Each select can override the ones before it, so the order of the
// statements below is significant. The sign of y is applied last.
_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_HALFN __clc_atan2(__CLC_HALFN y,
                                                          __CLC_HALFN x) {
  const __CLC_HALFN three_quarter_pi = 0x1.2d97c8p+1h;
  const __CLC_HALFN quarter_pi = 0x1.921fb6p-1h;
  const __CLC_HALFN half_pi = 0x1.921fb6p+0h;
  const __CLC_HALFN full_pi = 0x1.921fb6p+1h;

  __CLC_HALFN abs_x = __clc_fabs(x);
  __CLC_HALFN abs_y = __clc_fabs(y);
  __CLC_HALFN larger = __clc_fmax(abs_x, abs_y);
  __CLC_HALFN smaller = __clc_fmin(abs_x, abs_y);

  __CLC_HALFN ratio = smaller / larger;
  __CLC_HALFN angle = __clc_atan_reduced(ratio);

  // |y| > |x|: use pi/2 minus the reduced angle.
  __CLC_HALFN complement = half_pi - angle;
  angle = abs_y > abs_x ? complement : angle;

  // x < 0: use pi minus the angle.
  __CLC_HALFN reflected = full_pi - angle;
  angle = x < 0.0h ? reflected : angle;

  // y == 0: the magnitude is pi when the sign bit of x is set, else 0.
  __CLC_HALFN zero_y_angle = __clc_signbit(x) ? full_pi : 0.0h;
  angle = y == 0.0h ? zero_y_angle : angle;

  // x and y are both +-Inf: 3*pi/4 for negative x, pi/4 otherwise.
  __CLC_HALFN diagonal = x < 0.0h ? three_quarter_pi : quarter_pi;
  angle = (__clc_isinf(x) && __clc_isinf(y)) ? diagonal : angle;

  // x or y is NaN.
  angle = __clc_isunordered(x, y) ? HALF_NAN : angle;

  // The result carries the sign of y.
  return __clc_copysign(angle, y);
}
#endif