[Headers][NFC] Deduplicate gpu_match_ between targets via inlining (#131141)
Declare a few functions before including the target-specific headers, then define fallback __gpu_match_{any,all}_*_impl functions used by AMDGPU and by older NVPTX. Also fixes a minor bug on pre-Volta NVPTX where one of the four fallback paths was missing a sync_lane: in the diff below, the old pre-sm_70 __gpu_match_any_u32 returns without calling __gpu_sync_lane, unlike its u64 counterpart.
This commit is contained in:
parent 2044dd07da
commit c9d7f707c1
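The shape of the change, in miniature: the umbrella header forward declares the shared fallbacks, includes the target-specific header (whose entry points may call them), and only then defines them once. A minimal single-file model of that layering — the names here are hypothetical stand-ins, not the real headers:

/* Step 1: gpuintrin.h forward declares the shared fallback. */
static inline unsigned fallback_match_impl(unsigned mask);

/* Step 2: the included target header (amdgpuintrin.h / nvptxintrin.h)
 * defines its entry point in terms of the not-yet-defined fallback. */
static inline unsigned target_match_any(unsigned mask) {
  return fallback_match_impl(mask);
}

/* Step 3: back in gpuintrin.h, the fallback is defined exactly once. */
static inline unsigned fallback_match_impl(unsigned mask) {
  return mask; /* stand-in for the real ballot loop */
}

The real change applies the same shape across the three headers below.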
clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,57 +142,25 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,30 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -115,7 +139,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
@@ -234,6 +258,62 @@ __DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
 __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
 #undef __DO_LANE_SUM
 
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
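To make the fallback loop above concrete: each pass broadcasts the value of the first still-active lane; the lanes holding that value record the mask of lanes that matched and retire, so the loop runs once per distinct value. Below is a host-side model of __gpu_match_any_u32_impl for a pretend four-lane wavefront — a sketch for reasoning only, where arrays stand in for lanes and every harness name is made up:

#include <stdint.h>
#include <stdio.h>

#define LANES 4 /* pretend four-lane wavefront */

int main(void) {
  uint32_t x[LANES] = {7, 9, 7, 9};
  uint64_t match_mask[LANES] = {0};
  int done[LANES] = {0, 0, 0, 0};

  for (;;) {
    /* __gpu_ballot(__lane_mask, !__done): any lane still working? */
    int any = 0;
    for (int l = 0; l < LANES; ++l)
      any |= !done[l];
    if (!any)
      break;

    /* __gpu_read_first_lane_u32: the first not-done lane's value. */
    uint32_t first = 0;
    for (int l = 0; l < LANES; ++l)
      if (!done[l]) {
        first = x[l];
        break;
      }

    /* __gpu_lane_mask() inside `if (__first == __x)`: on the real target
     * the lanes active there are the not-done lanes holding the
     * broadcast value, so that group becomes the recorded mask. */
    uint64_t group = 0;
    for (int l = 0; l < LANES; ++l)
      if (!done[l] && x[l] == first)
        group |= 1ull << l;

    for (int l = 0; l < LANES; ++l)
      if (!done[l] && x[l] == first) {
        match_mask[l] = group;
        done[l] = 1;
      }
  }

  for (int l = 0; l < LANES; ++l) /* prints 0x5, 0xa, 0x5, 0xa */
    printf("lane %d -> 0x%llx\n", l, (unsigned long long)match_mask[l]);
}

With inputs {7, 9, 7, 9}, the first pass groups lanes 0 and 2 (mask 0x5) and the second groups lanes 1 and 3 (mask 0xa), matching the documented contract of __gpu_match_any_u32.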
clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i32(__lane_mask, __x);
 #else
-  uint32_t __match_mask = 0;
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  return __match_mask;
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
 #endif
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
@@ -178,22 +163,9 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i64(__lane_mask, __x);
 #else
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
 #endif
 }
 
 // Returns the current lane mask if every lane contains __x.
@@ -203,11 +175,9 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
 #if __CUDA_ARCH__ >= 700
   int predicate;
   return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
 #else
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
 #endif
 }
 
 // Returns the current lane mask if every lane contains __x.
@@ -217,11 +187,9 @@ __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
 #if __CUDA_ARCH__ >= 700
   int predicate;
   return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
 #else
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
 #endif
 }
 
 // Returns true if the flat pointer points to CUDA 'shared' memory.
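For reference, a minimal caller of the deduplicated entry points. This is a sketch only: it assumes compilation as GPU code with clang so that <gpuintrin.h> resolves; the kernel and buffer names are invented, while the __gpu_* functions are the ones shown in the diff above.

#include <gpuintrin.h>
#include <stdint.h>

/* Each lane asks which active lanes hold its value. On AMDGPU, and on
 * NVPTX before sm_70, __gpu_match_any_u32 now forwards to the shared
 * __gpu_match_any_u32_impl defined once in gpuintrin.h; newer NVPTX
 * still uses __nvvm_match_any_sync_i32 directly. */
__gpu_kernel void match_example(const uint32_t *in, uint64_t *out) {
  uint32_t id = __gpu_lane_id(); /* illustrative indexing per lane */
  out[id] = __gpu_match_any_u32(__gpu_lane_mask(), in[id]);
}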