diff --git a/libclc/clc/include/clc/shared/clc_less_aligned_types.h b/libclc/clc/include/clc/shared/clc_less_aligned_types.h
new file mode 100644
index 000000000000..73e436147ce0
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_less_aligned_types.h
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines types to be used with (CLC) vstore and vload functions. These are
+// vector types whose alignment is that of their respective scalar types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
+#define __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
+
+#define __CLC_BODY <clc/shared/clc_less_aligned_types.inc>
+#include <clc/math/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_less_aligned_types.inc>
+#include <clc/integer/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
diff --git a/libclc/clc/include/clc/shared/clc_less_aligned_types.inc b/libclc/clc/include/clc/shared/clc_less_aligned_types.inc
new file mode 100644
index 000000000000..45d69ea72fc0
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_less_aligned_types.inc
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines types to be used with (CLC) vstore and vload functions. These are
+// vector types whose alignment is that of their respective scalar types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+typedef __CLC_GENTYPE __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE);
+
+#else
+
+typedef __CLC_GENTYPE __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+    __attribute__((aligned(sizeof(__CLC_SCALAR_GENTYPE))));
+
+#endif
diff --git a/libclc/clc/include/clc/shared/clc_vload.h b/libclc/clc/include/clc/shared/clc_vload.h
new file mode 100644
index 000000000000..c3dbe0696cc1
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vload.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_VLOAD_H__
+#define __CLC_SHARED_CLC_VLOAD_H__
+
+#include <clc/shared/clc_less_aligned_types.h>
+
+#define __CLC_BODY <clc/shared/clc_vload.inc>
+#include <clc/math/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_vload.inc>
+#include <clc/integer/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_VLOAD_H__
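
To make the intent of these headers concrete: instantiated at __CLC_GENTYPE == float4, the .inc above yields a float4 typedef that carries only float (4-byte) alignment, which is what lets vloadN/vstoreN touch scalar-aligned memory. A minimal sketch of the expansion, assuming the float4 instantiation (the function name here is hypothetical, not part of the patch):

    /* less_aligned_float4 is float4 with the alignment of float, so a cast
       through it may legally load from any float-aligned address. */
    typedef float4 less_aligned_float4 __attribute__((aligned(sizeof(float))));

    /* Hypothetical helper showing the load pattern the library uses. */
    float4 example_vload4(size_t offset, const __global float *x) {
      /* A plain `__global float4 *` cast would assert 16-byte alignment;
         the less-aligned type keeps this load correct at 4 bytes. */
      return *(const __global less_aligned_float4 *)(&x[4 * offset]);
    }
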
diff --git a/libclc/clc/include/clc/shared/clc_vload.inc b/libclc/clc/include/clc/shared/clc_vload.inc
new file mode 100644
index 000000000000..8f3b00ec0445
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vload.inc
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VLOAD_NAME __CLC_XCONCAT(__clc_vload, __CLC_VECSIZE)
+#define CLC_VLOAD_HALF_NAME __CLC_XCONCAT(__clc_vload_half, __CLC_VECSIZE)
+#define CLC_VLOADA_HALF_NAME __CLC_XCONCAT(__clc_vloada_half, __CLC_VECSIZE)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+
+#define CLC_VLOAD_DECL(ADDRSPACE)                                             \
+  _CLC_OVERLOAD _CLC_DECL CLC_VLOAD_TY CLC_VLOAD_NAME(                        \
+      size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x);
+
+CLC_VLOAD_DECL(__private)
+CLC_VLOAD_DECL(__local)
+CLC_VLOAD_DECL(__constant)
+CLC_VLOAD_DECL(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VLOAD_DECL(__generic)
+#endif
+
+#undef CLC_VLOAD_DECL
+#undef CLC_VLOAD_TY
+
+#endif // __CLC_SCALAR
+
+// vload_half and vloada_half are available even if cl_khr_fp16 is unavailable.
+// Declare these functions when working on float types, which we know are
+// always available.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32
+
+#define CLC_VLOAD_HALF_DECL(ADDRSPACE)                                        \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOAD_HALF_NAME(                  \
+      size_t offset, const ADDRSPACE half *mem);                              \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOADA_HALF_NAME(                 \
+      size_t offset, const ADDRSPACE half *mem);
+
+CLC_VLOAD_HALF_DECL(__private)
+CLC_VLOAD_HALF_DECL(__local)
+CLC_VLOAD_HALF_DECL(__constant)
+CLC_VLOAD_HALF_DECL(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VLOAD_HALF_DECL(__generic)
+#endif
+
+#undef CLC_VLOAD_HALF_DECL
+
+#endif
+#endif
+
+#undef CLC_VLOAD_NAME
+#undef CLC_VLOAD_HALF_NAME
+#undef CLC_VLOADA_HALF_NAME
diff --git a/libclc/clc/include/clc/shared/clc_vstore.h b/libclc/clc/include/clc/shared/clc_vstore.h
new file mode 100644
index 000000000000..647dc7da1afb
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vstore.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_VSTORE_H__
+#define __CLC_SHARED_CLC_VSTORE_H__
+
+#include <clc/shared/clc_less_aligned_types.h>
+
+#define __CLC_BODY <clc/shared/clc_vstore.inc>
+#include <clc/math/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_vstore.inc>
+#include <clc/integer/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_VSTORE_H__
diff --git a/libclc/clc/include/clc/shared/clc_vstore.inc b/libclc/clc/include/clc/shared/clc_vstore.inc
new file mode 100644
index 000000000000..38d54b2f1b67
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vstore.inc
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) +#define CLC_VSTORE_NAME __CLC_XCONCAT(__clc_vstore, __CLC_VECSIZE) +#define CLC_VSTORE_HALF_NAME(x) \ + __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstore_half, __CLC_VECSIZE), x) +#define CLC_VSTOREA_HALF_NAME(x) \ + __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstorea_half, __CLC_VECSIZE), x) + +#ifndef __CLC_SCALAR + +#define CLC_VSTORE_DECL(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_NAME( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p); + +CLC_VSTORE_DECL(__private) +CLC_VSTORE_DECL(__local) +CLC_VSTORE_DECL(__global) + +#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED +CLC_VSTORE_DECL(__generic) +#endif + +#undef CLC_VSTORE_DECL + +#endif // __CLC_SCALAR + +// vstore_half and vstorea_half are available even if cl_khr_fp16 is +// unavailable. +#ifdef __CLC_FPSIZE +#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 + +#define CLC_VSTORE_HALF_DECL(ADDRSPACE, SUFFIX) \ + _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_HALF_NAME(SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); \ + \ + _CLC_OVERLOAD _CLC_DECL void CLC_VSTOREA_HALF_NAME(SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); + +#define CLC_VSTORE_HALF_DECL_ALL_MODES(ADDRSPACE) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, ) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtz) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtn) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtp) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rte) + +CLC_VSTORE_HALF_DECL_ALL_MODES(__private) +CLC_VSTORE_HALF_DECL_ALL_MODES(__local) +CLC_VSTORE_HALF_DECL_ALL_MODES(__global) + +#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED +CLC_VSTORE_HALF_DECL_ALL_MODES(__generic) +#endif + +#undef CLC_VSTORE_HALF_DECL +#undef CLC_VSTORE_HALF_DECL_ALL_MODES + +#endif +#endif + +#undef CLC_VSTORE_TY +#undef CLC_VSTORE_NAME +#undef CLC_VSTORE_HALF_NAME +#undef CLC_VSTOREA_HALF_NAME diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index a8a906159e28..49c7ca636f24 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -147,3 +147,5 @@ relational/clc_signbit.cl shared/clc_clamp.cl shared/clc_max.cl shared/clc_min.cl +shared/clc_vload.cl +shared/clc_vstore.cl diff --git a/libclc/clc/lib/generic/shared/clc_vload.cl b/libclc/clc/lib/generic/shared/clc_vload.cl new file mode 100644 index 000000000000..e4003e4a9673 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_vload.cl @@ -0,0 +1,130 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/shared/clc_vload.h>
+
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                                \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 __clc_vload2(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##2 *)(&x[2 * offset]));    \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 __clc_vload3(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    PRIM_TYPE##2 vec =                                                        \
+        *((const ADDR_SPACE less_aligned_##PRIM_TYPE##2 *)(&x[3 * offset]));  \
+    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]);                 \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 __clc_vload4(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##4 *)(&x[4 * offset]));    \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 __clc_vload8(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##8 *)(&x[8 * offset]));    \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 __clc_vload16(                         \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##16 *)(&x[16 * offset]));  \
+  }
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE
+#else
+// The generic address space isn't available, so make the macro do nothing
+#define VLOAD_VECTORIZE_GENERIC(X, Y)
+#endif
+
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE)                               \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private)                            \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local)                              \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant)                           \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)                             \
+  VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic)
+
+#define VLOAD_TYPES()                                                         \
+  VLOAD_ADDR_SPACES(char)                                                     \
+  VLOAD_ADDR_SPACES(uchar)                                                    \
+  VLOAD_ADDR_SPACES(short)                                                    \
+  VLOAD_ADDR_SPACES(ushort)                                                   \
+  VLOAD_ADDR_SPACES(int)                                                      \
+  VLOAD_ADDR_SPACES(uint)                                                     \
+  VLOAD_ADDR_SPACES(long)                                                     \
+  VLOAD_ADDR_SPACES(ulong)                                                    \
+  VLOAD_ADDR_SPACES(float)
+
+VLOAD_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+VLOAD_ADDR_SPACES(double)
+#endif
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+VLOAD_ADDR_SPACES(half)
+#endif
+
+/* The vload_half functions are legal even without cl_khr_fp16. */
+/* There is no vload_half for double. */
+#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
+#define VEC_LOAD2(val, AS)                                                    \
+  VEC_LOAD1(val.lo, AS)                                                       \
+  VEC_LOAD1(val.hi, AS)
+#define VEC_LOAD3(val, AS)                                                    \
+  VEC_LOAD1(val.s0, AS)                                                       \
+  VEC_LOAD1(val.s1, AS)                                                       \
+  VEC_LOAD1(val.s2, AS)
+#define VEC_LOAD4(val, AS)                                                    \
+  VEC_LOAD2(val.lo, AS)                                                       \
+  VEC_LOAD2(val.hi, AS)
+#define VEC_LOAD8(val, AS)                                                    \
+  VEC_LOAD4(val.lo, AS)                                                       \
+  VEC_LOAD4(val.hi, AS)
+#define VEC_LOAD16(val, AS)                                                   \
+  VEC_LOAD8(val.lo, AS)                                                       \
+  VEC_LOAD8(val.hi, AS)
+
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                       \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_vload_half##SUFFIX(size_t offset,         \
+                                                       const AS half *mem) {  \
+    offset *= VEC_SIZE;                                                       \
+    TYPE __tmp;                                                               \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
+  }                                                                           \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_vloada_half##SUFFIX(size_t offset,        \
+                                                        const AS half *mem) { \
+    offset *= OFFSET_SIZE;                                                    \
+    TYPE __tmp;                                                               \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
+  }
+
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                         \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
+
+#define __CLC_BODY "clc_vload_half.inc"
+#include <clc/math/gentype.inc>
+#undef FUNC
+#undef __FUNC
+#undef VEC_LOAD16
+#undef VEC_LOAD8
+#undef VEC_LOAD4
+#undef VEC_LOAD3
+#undef VEC_LOAD2
+#undef VEC_LOAD1
+#undef VLOAD_TYPES
+#undef VLOAD_ADDR_SPACES
+#undef VLOAD_VECTORIZE
+#undef VLOAD_VECTORIZE_GENERIC
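
One subtlety in the macro above is __clc_vload3: it reads a two-element vector plus one scalar rather than casting to a three- or four-element vector type, so it never touches a fourth element the caller may not own. Expanded by hand for int in the global address space (an illustrative expansion, not additional patch content):

    _CLC_OVERLOAD _CLC_DEF int3 __clc_vload3(size_t offset,
                                             const __global int *x) {
      /* Elements 0 and 1 via one scalar-aligned 2-vector load... */
      int2 vec = *(const __global less_aligned_int2 *)(&x[3 * offset]);
      /* ...and element 2 on its own, so no byte past the third is read. */
      return (int3)(vec.s0, vec.s1, x[offset * 3 + 2]);
    }
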
diff --git a/libclc/opencl/lib/generic/shared/vload_half.inc b/libclc/clc/lib/generic/shared/clc_vload_half.inc
similarity index 100%
rename from libclc/opencl/lib/generic/shared/vload_half.inc
rename to libclc/clc/lib/generic/shared/clc_vload_half.inc
diff --git a/libclc/clc/lib/generic/shared/clc_vstore.cl b/libclc/clc/lib/generic/shared/clc_vstore.cl
new file mode 100644
index 000000000000..adde58aec915
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_vstore.cl
@@ -0,0 +1,268 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/math/clc_copysign.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_nextafter.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/shared/clc_min.h>
+#include <clc/shared/clc_vstore.h>
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                               \
+  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore2(PRIM_TYPE##2 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
+           *)(&mem[2 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore3(PRIM_TYPE##3 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
+           *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1);            \
+    mem[3 * offset + 2] = vec.s2;                                             \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore4(PRIM_TYPE##4 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4                     \
+           *)(&mem[4 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore8(PRIM_TYPE##8 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8                     \
+           *)(&mem[8 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore16(PRIM_TYPE##16 vec, size_t offset, \
+                                             ADDR_SPACE PRIM_TYPE *mem) {     \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16                    \
+           *)(&mem[16 * offset])) = vec;                                      \
+  }
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE
+#else
+// The generic address space isn't available, so make the macro do nothing
+#define VSTORE_VECTORIZE_GENERIC(X, Y)
+#endif
+
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE)                        \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private)                     \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local)                       \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)                      \
+  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
+
+VSTORE_ADDR_SPACES(char)
+VSTORE_ADDR_SPACES(uchar)
+VSTORE_ADDR_SPACES(short)
+VSTORE_ADDR_SPACES(ushort)
+VSTORE_ADDR_SPACES(int)
+VSTORE_ADDR_SPACES(uint)
+VSTORE_ADDR_SPACES(long)
+VSTORE_ADDR_SPACES(ulong)
+VSTORE_ADDR_SPACES(float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+VSTORE_ADDR_SPACES(double)
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+VSTORE_ADDR_SPACES(half)
+#endif
+
+#define VEC_STORE1(val, ROUNDF, BUILTIN) BUILTIN(ROUNDF(val), &mem[offset++]);
+
+#define VEC_STORE2(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE1(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE1(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE3(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE1(val.s0, ROUNDF, BUILTIN)                                         \
+  VEC_STORE1(val.s1, ROUNDF, BUILTIN)                                         \
+  VEC_STORE1(val.s2, ROUNDF, BUILTIN)
+#define VEC_STORE4(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE2(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE2(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE8(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE4(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE4(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE16(val, ROUNDF, BUILTIN)                                     \
+  VEC_STORE8(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE8(val.hi, ROUNDF, BUILTIN)
+
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN)           \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore_half##SUFFIX(                      \
+      TYPE vec, size_t offset, AS half *mem) {                                \
+    offset *= VEC_SIZE;                                                       \
+    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
+  }                                                                           \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstorea_half##SUFFIX(                     \
+      TYPE vec, size_t offset, AS half *mem) {                                \
+    offset *= OFFSET;                                                         \
+    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
+  }
+
+_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
+_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
+  /* Remove lower 13 bits to make sure the number is rounded down */
+  int mask = 0xffffe000;
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask <<= __clc_min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (__clc_fabs(x) > 65504.0f && !__clc_isinf(x))
+    return __clc_copysign(65504.0f, x);
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  return __clc_as_float(__clc_as_uint(x) & mask);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
+  const float inf = __clc_copysign(INFINITY, x);
+  /* Set lower 13 bits */
+  int mask = (1 << 13) - 1;
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask = (1 << (13 + __clc_min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  const float next =
+      __clc_nextafter(__clc_as_float(__clc_as_uint(x) | mask), inf);
+  return ((__clc_as_uint(x) & mask) == 0) ? x : next;
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
+  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
+  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
+  /* Mantissa + implicit bit */
+  const uint mantissa = (__clc_as_uint(x) & 0x7fffff) | (1u << 23);
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  int shift = 13;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += __clc_min(-(exp + 14), 15);
+  }
+  int mask = (1 << shift) - 1;
+  const uint grs = mantissa & mask;
+  const uint last = mantissa & (1 << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1 << (shift - 1))) ||
+                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
+}
+
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
+_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
+  /* Remove lower 42 bits to make sure the number is rounded down */
+  ulong mask = 0xfffffc0000000000UL;
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask <<= __clc_min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (__clc_fabs(x) > 65504.0 && !__clc_isinf(x))
+    return __clc_copysign(65504.0, x);
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  return __clc_as_double(__clc_as_ulong(x) & mask);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
+  const double inf = __clc_copysign((double)INFINITY, x);
+  /* Set lower 42 bits */
+  long mask = (1UL << 42UL) - 1UL;
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask = (1UL << (42UL + __clc_min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  const double next =
+      __clc_nextafter(__clc_as_double(__clc_as_ulong(x) | mask), inf);
+  return ((__clc_as_ulong(x) & mask) == 0) ? x : next;
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
+  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
+                                                           : __clc_rti(x);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
+  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
+                                                           : __clc_rtz(x);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
+  /* Mantissa + implicit bit */
+  const ulong mantissa = (__clc_as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  int shift = 42;
+  if (exp < -14) {
+    /* The default assumes lower 42 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += __clc_min(-(exp + 14), 15);
+  }
+  ulong mask = (1UL << shift) - 1UL;
+  const ulong grs = mantissa & mask;
+  const ulong last = mantissa & (1UL << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1UL << (shift - 1UL))) ||
+                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
+}
+#endif
+
+#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                  \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN)             \
+  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, BUILTIN)        \
+  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, BUILTIN)        \
+  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, BUILTIN)        \
+  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN)
+
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                     \
+  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)
+
+#define __CLC_BODY "clc_vstore_half.inc"
+#include <clc/math/gentype.inc>
+#undef FUNC
+#undef __XFUNC
+#undef __FUNC
+#undef VEC_STORE16
+#undef VEC_STORE8
+#undef VEC_STORE4
+#undef VEC_STORE3
+#undef VEC_STORE2
+#undef VEC_STORE1
+#undef DECLARE_HELPER
+#undef VSTORE_ADDR_SPACES
+#undef VSTORE_VECTORIZE
+#undef VSTORE_VECTORIZE_GENERIC
diff --git a/libclc/opencl/lib/generic/shared/vstore_half.inc b/libclc/clc/lib/generic/shared/clc_vstore_half.inc
similarity index 100%
rename from libclc/opencl/lib/generic/shared/vstore_half.inc
rename to libclc/clc/lib/generic/shared/clc_vstore_half.inc
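
The rounding helpers in clc_vstore.cl above work directly on the float bit pattern: half carries 13 fewer mantissa bits than float, so clearing or bumping those low 13 bits yields a float that then converts to half exactly, with the rounding decision already made. A small worked sketch using the helpers defined in that file (the function name is hypothetical; results assume IEEE half with 10 mantissa bits):

    void example_rounding(void) {
      float x = 0x1.001p+0f;     /* 1 + 2^-12: not representable in half */
      float down = __clc_rtz(x); /* low 13 mantissa bits cleared -> 1.0f */
      float up = __clc_rti(x);   /* next half away from zero -> 1 + 2^-10 */
    }
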
diff --git a/libclc/opencl/lib/generic/shared/vload.cl b/libclc/opencl/lib/generic/shared/vload.cl
index 4bfb5a012ce1..ad2283958013 100644
--- a/libclc/opencl/lib/generic/shared/vload.cl
+++ b/libclc/opencl/lib/generic/shared/vload.cl
@@ -7,134 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/clc.h>
+#include <clc/shared/clc_vload.h>
 
-#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                                \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2        \
-                  *)(&x[2 * offset]));                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    PRIM_TYPE##2 vec =                                                        \
-        *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2           \
-               *)(&x[3 * offset]));                                           \
-    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]);                 \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4        \
-                  *)(&x[4 * offset]));                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8        \
-                  *)(&x[8 * offset]));                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(                               \
-      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16       \
-                  *)(&x[16 * offset]));                                       \
-  }
+#define __CLC_BODY "vload.inc"
+#include <clc/integer/gentype.inc>
 
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VLOAD_VECTORIZE_GENERIC(X, Y)
-#endif
-
-#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE)                               \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private)                            \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local)                              \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant)                           \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)                             \
-  VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic)
-
-#define VLOAD_TYPES()                                                         \
-  VLOAD_ADDR_SPACES(char)                                                     \
-  VLOAD_ADDR_SPACES(uchar)                                                    \
-  VLOAD_ADDR_SPACES(short)                                                    \
-  VLOAD_ADDR_SPACES(ushort)                                                   \
-  VLOAD_ADDR_SPACES(int)                                                      \
-  VLOAD_ADDR_SPACES(uint)                                                     \
-  VLOAD_ADDR_SPACES(long)                                                     \
-  VLOAD_ADDR_SPACES(ulong)                                                    \
-  VLOAD_ADDR_SPACES(float)
-
-VLOAD_TYPES()
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VLOAD_ADDR_SPACES(double)
-#endif
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VLOAD_ADDR_SPACES(half)
-#endif
-
-/* vload_half are legal even without cl_khr_fp16 */
-/* no vload_half for double */
-#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
-#define VEC_LOAD2(val, AS)                                                    \
-  VEC_LOAD1(val.lo, AS)                                                       \
-  VEC_LOAD1(val.hi, AS)
-#define VEC_LOAD3(val, AS)                                                    \
-  VEC_LOAD1(val.s0, AS)                                                       \
-  VEC_LOAD1(val.s1, AS)                                                       \
-  VEC_LOAD1(val.s2, AS)
-#define VEC_LOAD4(val, AS)                                                    \
-  VEC_LOAD2(val.lo, AS)                                                       \
-  VEC_LOAD2(val.hi, AS)
-#define VEC_LOAD8(val, AS)                                                    \
-  VEC_LOAD4(val.lo, AS)                                                       \
-  VEC_LOAD4(val.hi, AS)
-#define VEC_LOAD16(val, AS)                                                   \
-  VEC_LOAD8(val.lo, AS)                                                       \
-  VEC_LOAD8(val.hi, AS)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                       \
-  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset,               \
-                                                 const AS half *mem) {        \
-    offset *= VEC_SIZE;                                                       \
-    TYPE __tmp;                                                               \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
-  }                                                                           \
-  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset,              \
-                                                  const AS half *mem) {       \
-    offset *= OFFSET_SIZE;                                                    \
-    TYPE __tmp;                                                               \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
-  }
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                         \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
-
-#define __CLC_BODY "vload_half.inc"
+#define __CLC_BODY "vload.inc"
 #include <clc/math/gentype.inc>
-#undef FUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef VLOAD_TYPES
-#undef VLOAD_ADDR_SPACES
-#undef VLOAD_VECTORIZE
-#undef VLOAD_VECTORIZE_GENERIC
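
After this hunk, the OpenCL-layer vload.cl is a thin forwarding shim. Reconstructed in full from the hunk above (with the include arguments restored as in this rewrite, so treat the exact paths as an assumption):

    #include <clc/clc.h>
    #include <clc/shared/clc_vload.h>

    #define __CLC_BODY "vload.inc"
    #include <clc/integer/gentype.inc>

    #define __CLC_BODY "vload.inc"
    #include <clc/math/gentype.inc>
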
diff --git a/libclc/opencl/lib/generic/shared/vload.inc b/libclc/opencl/lib/generic/shared/vload.inc
new file mode 100644
index 000000000000..62cb040aad18
--- /dev/null
+++ b/libclc/opencl/lib/generic/shared/vload.inc
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VLOAD_NAME(x) __CLC_XCONCAT(__CLC_XCONCAT(x, vload), __CLC_VECSIZE)
+#define CLC_VLOAD_HALF_NAME(x)                                                \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vload_half), __CLC_VECSIZE)
+#define CLC_VLOADA_HALF_NAME(x)                                               \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vloada_half), __CLC_VECSIZE)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+
+#define VLOAD_DEF(ADDRSPACE)                                                  \
+  _CLC_OVERLOAD _CLC_DEF CLC_VLOAD_TY CLC_VLOAD_NAME()(                       \
+      size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x) {               \
+    return CLC_VLOAD_NAME(__clc_)(offset, x);                                 \
+  }
+
+VLOAD_DEF(__private)
+VLOAD_DEF(__local)
+VLOAD_DEF(__constant)
+VLOAD_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+VLOAD_DEF(__generic)
+#endif
+
+#undef VLOAD_DEF
+#undef CLC_VLOAD_TY
+
+#endif
+
+// vload_half and vloada_half are available even if cl_khr_fp16 is unavailable.
+// Declare these functions when working on float types, which we know are
+// always available.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32
+
+#define VLOAD_HALF_DEF(ADDRSPACE)                                             \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOAD_HALF_NAME()(                 \
+      size_t offset, const ADDRSPACE half *mem) {                             \
+    return CLC_VLOAD_HALF_NAME(__clc_)(offset, mem);                          \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOADA_HALF_NAME()(                \
+      size_t offset, const ADDRSPACE half *mem) {                             \
+    return CLC_VLOADA_HALF_NAME(__clc_)(offset, mem);                         \
+  }
+
+VLOAD_HALF_DEF(__private)
+VLOAD_HALF_DEF(__local)
+VLOAD_HALF_DEF(__constant)
+VLOAD_HALF_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+VLOAD_HALF_DEF(__generic)
+#endif
+
+#undef VLOAD_HALF_DEF
+#endif
+#endif
+
+#undef CLC_VLOAD_NAME
+#undef CLC_VLOAD_HALF_NAME
+#undef CLC_VLOADA_HALF_NAME
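
Each OpenCL entry point in vload.inc is a one-line forward to its __clc_-prefixed CLC counterpart: CLC_VLOAD_NAME() with an empty argument yields the public name, and CLC_VLOAD_NAME(__clc_) the internal one. For instance, VLOAD_DEF(__global) instantiated at float4 expands to roughly (an illustrative expansion):

    _CLC_OVERLOAD _CLC_DEF less_aligned_float4 vload4(size_t offset,
                                                      const __global float *x) {
      /* Public OpenCL builtin forwards to the CLC implementation. */
      return __clc_vload4(offset, x);
    }
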
diff --git a/libclc/opencl/lib/generic/shared/vstore.cl b/libclc/opencl/lib/generic/shared/vstore.cl
index fe4890defe84..145658f873dc 100644
--- a/libclc/opencl/lib/generic/shared/vstore.cl
+++ b/libclc/opencl/lib/generic/shared/vstore.cl
@@ -7,253 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/clc.h>
+#include <clc/shared/clc_vstore.h>
 
-#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#define __CLC_BODY "vstore.inc"
+#include <clc/integer/gentype.inc>
 
-#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                               \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
-          *)(&mem[2 * offset])) = vec;                                        \
-  }                                                                           \
-                                                                              \
-  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
-          *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1);             \
-    mem[3 * offset + 2] = vec.s2;                                             \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4                     \
-          *)(&mem[4 * offset])) = vec;                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8                     \
-          *)(&mem[8 * offset])) = vec;                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset,      \
-                                       ADDR_SPACE PRIM_TYPE *mem) {           \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16                    \
-          *)(&mem[16 * offset])) = vec;                                       \
-  }
-
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VSTORE_VECTORIZE_GENERIC(X, Y)
-#endif
-
-#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE)                        \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private)                     \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local)                       \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)                      \
-  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
-
-VSTORE_ADDR_SPACES(char)
-VSTORE_ADDR_SPACES(uchar)
-VSTORE_ADDR_SPACES(short)
-VSTORE_ADDR_SPACES(ushort)
-VSTORE_ADDR_SPACES(int)
-VSTORE_ADDR_SPACES(uint)
-VSTORE_ADDR_SPACES(long)
-VSTORE_ADDR_SPACES(ulong)
-VSTORE_ADDR_SPACES(float)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VSTORE_ADDR_SPACES(double)
-#endif
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VSTORE_ADDR_SPACES(half)
-#endif
-
-#define VEC_STORE1(val, ROUNDF, BUILTIN) BUILTIN(ROUNDF(val), &mem[offset++]);
-
-#define VEC_STORE2(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE1(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE1(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE3(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE1(val.s0, ROUNDF, BUILTIN)                                         \
-  VEC_STORE1(val.s1, ROUNDF, BUILTIN)                                         \
-  VEC_STORE1(val.s2, ROUNDF, BUILTIN)
-#define VEC_STORE4(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE2(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE2(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE8(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE4(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE4(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE16(val, ROUNDF, BUILTIN)                                     \
-  VEC_STORE8(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE8(val.hi, ROUNDF, BUILTIN)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN)           \
-  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset,    \
-                                                  AS half *mem) {             \
-    offset *= VEC_SIZE;                                                       \
-    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
-  }                                                                           \
-  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset,   \
-                                                   AS half *mem) {            \
-    offset *= OFFSET;                                                         \
-    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
-  }
-
-_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
-  /* Remove lower 13 bits to make sure the number is rounded down */
-  int mask = 0xffffe000;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0f && !isinf(x))
-    return copysign(65504.0f, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_float(as_uint(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
-  const float inf = copysign(INFINITY, x);
-  /* Set lower 13 bits */
-  int mask = (1 << 13) - 1;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const float next = nextafter(as_float(as_uint(x) | mask), inf);
-  return ((as_uint(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
-  /* Mantisa + implicit bit */
-  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  int shift = 13;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  int mask = (1 << shift) - 1;
-  const uint grs = mantissa & mask;
-  const uint last = mantissa & (1 << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1 << (shift - 1))) ||
-                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-
-#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
-  /* Remove lower 42 bits to make sure the number is rounded down */
-  ulong mask = 0xfffffc0000000000UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0 && !isinf(x))
-    return copysign(65504.0, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_double(as_ulong(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
-  const double inf = copysign((double)INFINITY, x);
-  /* Set lower 42 bits */
-  long mask = (1UL << 42UL) - 1UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
-  return ((as_ulong(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
-                                                     : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
-                                                     : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
-  /* Mantisa + implicit bit */
-  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  int shift = 42;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  ulong mask = (1UL << shift) - 1UL;
-  const ulong grs = mantissa & mask;
-  const ulong last = mantissa & (1UL << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1UL << (shift - 1UL))) ||
-                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-#endif
-
-#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                  \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN)             \
-  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, BUILTIN)        \
-  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, BUILTIN)        \
-  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, BUILTIN)        \
-  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN)
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                     \
-  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)
-
-#define __CLC_BODY "vstore_half.inc"
+#define __CLC_BODY "vstore.inc"
 #include <clc/math/gentype.inc>
-#undef FUNC
-#undef __XFUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef DECLARE_HELPER
-#undef VSTORE_ADDR_SPACES
-#undef VSTORE_VECTORIZE
-#undef VSTORE_VECTORIZE_GENERIC
diff --git a/libclc/opencl/lib/generic/shared/vstore.inc b/libclc/opencl/lib/generic/shared/vstore.inc
new file mode 100644
index 000000000000..4bdce0719912
--- /dev/null
+++ b/libclc/opencl/lib/generic/shared/vstore.inc
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+#define CLC_VSTORE_NAME(x)                                                    \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vstore), __CLC_VECSIZE)
+#define CLC_VSTORE_HALF_NAME(x, y)                                            \
+  __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstore_half), __CLC_VECSIZE), y)
+#define CLC_VSTOREA_HALF_NAME(x, y)                                           \
+  __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstorea_half), __CLC_VECSIZE), y)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VSTORE_DEF(ADDRSPACE)                                             \
+  _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_NAME()(                              \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p) { \
+    return CLC_VSTORE_NAME(__clc_)(data, offset, p);                          \
+  }
+
+CLC_VSTORE_DEF(__private)
+CLC_VSTORE_DEF(__local)
+CLC_VSTORE_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VSTORE_DEF(__generic)
+#endif
+
+#undef CLC_VSTORE_DEF
+
+#endif // __CLC_SCALAR
+
+// vstore_half and vstorea_half are available even if cl_khr_fp16 is
+// unavailable.
+#ifdef __CLC_FPSIZE +#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 + +#define CLC_VSTORE_HALF_DEF(ADDRSPACE, SUFFIX) \ + _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_HALF_NAME(, SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ + CLC_VSTORE_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF void CLC_VSTOREA_HALF_NAME(, SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ + CLC_VSTOREA_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ + } + +#define CLC_VSTORE_HALF_DEF_ALL_MODES(ADDRSPACE) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, ) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtz) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtn) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtp) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rte) + +CLC_VSTORE_HALF_DEF_ALL_MODES(__private) +CLC_VSTORE_HALF_DEF_ALL_MODES(__local) +CLC_VSTORE_HALF_DEF_ALL_MODES(__global) + +#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED +CLC_VSTORE_HALF_DEF_ALL_MODES(__generic) +#endif + +#undef CLC_VSTORE_HALF_DEF +#undef CLC_VSTORE_HALF_DEF_ALL_MODES + +#endif +#endif + +#undef CLC_VSTORE_TY +#undef CLC_VSTORE_NAME +#undef CLC_VSTORE_HALF_NAME +#undef CLC_VSTOREA_HALF_NAME
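
End to end, user code is unaffected by the move: a public call still resolves to the same rounding and store sequence, now routed through the __clc_-prefixed CLC layer. A minimal usage sketch (the kernel name is hypothetical; vstore_half4_rtz is the standard OpenCL builtin):

    __kernel void pack_to_half(__global const float4 *in, __global half *out) {
      size_t i = get_global_id(0);
      /* Forwards to __clc_vstore_half4_rtz, which applies __clc_rtz per
         lane before the exact float-to-half conversion and store. */
      vstore_half4_rtz(in[i], i, out);
    }
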