[Clang] Fix constant bit widths in gpuintrin.h (#189387)
Summary: The `ull` suffix can mean 128 bits on some architectures. Replace this with the `stdint.h` constructor to be certain.
This commit is contained in:
parent
7364203924
commit
0d2c59abd5
@ -147,11 +147,10 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
|
||||
// Copies the value from the first active thread to the rest.
|
||||
_DEFAULT_FN_ATTRS static __inline__ uint64_t
|
||||
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
|
||||
uint32_t __hi = (uint32_t)(__x >> 32ull);
|
||||
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
|
||||
return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
|
||||
((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
|
||||
0xFFFFFFFFull);
|
||||
uint32_t __hi = (uint32_t)(__x >> 32);
|
||||
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
|
||||
return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32) |
|
||||
((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFF);
|
||||
}
|
||||
|
||||
// Gets the first floating point value from the active lanes.
|
||||
@ -174,11 +173,10 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
|
||||
_DEFAULT_FN_ATTRS static __inline__ uint64_t
|
||||
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
|
||||
uint32_t __width) {
|
||||
uint32_t __hi = (uint32_t)(__x >> 32ull);
|
||||
uint32_t __hi = (uint32_t)(__x >> 32);
|
||||
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
|
||||
uint32_t __mask = (uint32_t)__lane_mask;
|
||||
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
|
||||
<< 32ull) |
|
||||
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) << 32) |
|
||||
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
|
||||
}
|
||||
|
||||
@ -211,7 +209,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
|
||||
_DEFAULT_FN_ATTRS static __inline__ __type \
|
||||
__gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
|
||||
__type __x) { \
|
||||
uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id()); \
|
||||
uint64_t __above = __lane_mask & -(UINT64_C(2) << __gpu_lane_id()); \
|
||||
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
|
||||
uint32_t __src = __builtin_ctzg(__above, (int)sizeof(__above) * 8); \
|
||||
__type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
|
||||
@ -226,14 +224,15 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
|
||||
_DEFAULT_FN_ATTRS static __inline__ __type \
|
||||
__gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
|
||||
__type __x) { \
|
||||
uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1); \
|
||||
uint64_t __below = __lane_mask & ((UINT64_C(1) << __gpu_lane_id()) - 1); \
|
||||
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
|
||||
uint32_t __src = 63 - __builtin_clzg(__below, (int)sizeof(__below) * 8); \
|
||||
__type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
|
||||
__gpu_num_lanes()); \
|
||||
__x = __op(__x, __below ? __result : (__type)__identity); \
|
||||
for (uint32_t __i = 0; __i < __step; ++__i) \
|
||||
__below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below; \
|
||||
__below ^= \
|
||||
(UINT64_C(1) << (63 - __builtin_clzg(__below, 0))) & __below; \
|
||||
} \
|
||||
return __x; \
|
||||
} \
|
||||
@ -337,7 +336,7 @@ __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
|
||||
uint32_t __first = __gpu_shuffle_idx_u32(
|
||||
__lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
|
||||
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
|
||||
return __ballot == __lane_mask ? __lane_mask : 0ull;
|
||||
return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
|
||||
}
|
||||
|
||||
// Returns the current lane mask if every lane contains __x.
|
||||
@ -346,7 +345,7 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
|
||||
uint64_t __first = __gpu_shuffle_idx_u64(
|
||||
__lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
|
||||
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
|
||||
return __ballot == __lane_mask ? __lane_mask : 0ull;
|
||||
return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
|
||||
}
|
||||
|
||||
_Pragma("omp end declare variant");
|
||||
|
||||
@ -137,7 +137,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
|
||||
uint32_t __width) {
|
||||
// Mask out inactive lanes to match AMDGPU behavior.
|
||||
uint32_t __mask = (uint32_t)__lane_mask;
|
||||
bool __bitmask = (1ull << __idx) & __lane_mask;
|
||||
bool __bitmask = (UINT64_C(1) << __idx) & __lane_mask;
|
||||
return -__bitmask &
|
||||
__nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
|
||||
((__gpu_num_lanes() - __width) << 8u) | 0x1f);
|
||||
|
||||
@ -39,4 +39,7 @@ typedef unsigned __INTPTR_TYPE__ uintptr_t;
|
||||
#define UINT32_MAX __UINT32_C(4294967295)
|
||||
#define UINT64_MAX __UINT64_C(18446744073709551615)
|
||||
|
||||
#define UINT32_C __UINT32_C
|
||||
#define UINT64_C __UINT64_C
|
||||
|
||||
#endif /* STDINT_H */
|
||||
|
||||
@ -40,6 +40,7 @@ __device__ int foo() { return __gpu_thread_id_x(); }
|
||||
#elif defined(SYCL)
|
||||
extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
|
||||
#else
|
||||
//
|
||||
// CUDA-LABEL: define dso_local i32 @foo(
|
||||
// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
|
||||
// CUDA-NEXT: [[ENTRY:.*:]]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user