[Clang] Fix constant bit widths in gpuintrin.h (#189387)

Summary:
The `ull` suffix can mean 128 bits on some architectures. Replace this
with the `stdint.h` constructor to be certain.
This commit is contained in:
Joseph Huber 2026-03-30 14:19:01 -05:00 committed by GitHub
parent 7364203924
commit 0d2c59abd5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 17 additions and 14 deletions

View File

@ -147,11 +147,10 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
0xFFFFFFFFull);
uint32_t __hi = (uint32_t)(__x >> 32);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32) |
((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFF);
}
// Gets the first floating point value from the active lanes.
@ -174,11 +173,10 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __hi = (uint32_t)(__x >> 32);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
<< 32ull) |
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) << 32) |
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
@ -211,7 +209,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
_DEFAULT_FN_ATTRS static __inline__ __type \
__gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
__type __x) { \
uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id()); \
uint64_t __above = __lane_mask & -(UINT64_C(2) << __gpu_lane_id()); \
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
uint32_t __src = __builtin_ctzg(__above, (int)sizeof(__above) * 8); \
__type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
@ -226,14 +224,15 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
_DEFAULT_FN_ATTRS static __inline__ __type \
__gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
__type __x) { \
uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1); \
uint64_t __below = __lane_mask & ((UINT64_C(1) << __gpu_lane_id()) - 1); \
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
uint32_t __src = 63 - __builtin_clzg(__below, (int)sizeof(__below) * 8); \
__type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
__gpu_num_lanes()); \
__x = __op(__x, __below ? __result : (__type)__identity); \
for (uint32_t __i = 0; __i < __step; ++__i) \
__below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below; \
__below ^= \
(UINT64_C(1) << (63 - __builtin_clzg(__below, 0))) & __below; \
} \
return __x; \
} \
@ -337,7 +336,7 @@ __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
uint32_t __first = __gpu_shuffle_idx_u32(
__lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
return __ballot == __lane_mask ? __lane_mask : 0ull;
return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
}
// Returns the current lane mask if every lane contains __x.
@ -346,7 +345,7 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
uint64_t __first = __gpu_shuffle_idx_u64(
__lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
return __ballot == __lane_mask ? __lane_mask : 0ull;
return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
}
_Pragma("omp end declare variant");

View File

@ -137,7 +137,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
uint32_t __width) {
// Mask out inactive lanes to match AMDGPU behavior.
uint32_t __mask = (uint32_t)__lane_mask;
bool __bitmask = (1ull << __idx) & __lane_mask;
bool __bitmask = (UINT64_C(1) << __idx) & __lane_mask;
return -__bitmask &
__nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
((__gpu_num_lanes() - __width) << 8u) | 0x1f);

View File

@ -39,4 +39,7 @@ typedef unsigned __INTPTR_TYPE__ uintptr_t;
#define UINT32_MAX __UINT32_C(4294967295)
#define UINT64_MAX __UINT64_C(18446744073709551615)
#define UINT32_C __UINT32_C
#define UINT64_C __UINT64_C
#endif /* STDINT_H */

View File

@ -40,6 +40,7 @@ __device__ int foo() { return __gpu_thread_id_x(); }
#elif defined(SYCL)
extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
#else
//
// CUDA-LABEL: define dso_local i32 @foo(
// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
// CUDA-NEXT: [[ENTRY:.*:]]