Summary: The changes in https://www.github.com/llvm/llvm-project/pull/185552 allowed us to start building the standard `libclang_rt.profile.a` for GPU targets. This PR expands this by adding an optimized GPU routine for counter increment and removing the special-case handling of these functions in the OpenMP runtime. Vast majority of these functions are boilerplate, but we should be able to do more interesting things with this in the future, like value or memory profiling.
43 lines
1.8 KiB
C
43 lines
1.8 KiB
C
/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\
|
|
|*
|
|
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|* See https://llvm.org/LICENSE.txt for license information.
|
|
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|*
|
|
\*===----------------------------------------------------------------------===*/
|
|
|
|
// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
|
|
// provides:
|
|
//
|
|
// Platform plumbing (section boundaries, binary IDs, VNodes) are handled by
|
|
// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.
|
|
|
|
#if defined(__NVPTX__) || defined(__AMDGPU__)
|
|
|
|
#include "InstrProfiling.h"
|
|
#include <gpuintrin.h>
|
|
|
|
// Indicates that the current wave is fully occupied.
|
|
static int is_uniform(uint64_t mask) {
|
|
const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes());
|
|
return mask == uniform_mask;
|
|
}
|
|
|
|
// Wave-cooperative counter increment. The instrumentation pass emits calls to
|
|
// this in place of the default non-atomic load/add/store or atomicrmw sequence.
|
|
// The optional uniform counter allows calculating wave uniformity if present.
|
|
COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
|
|
uint64_t *uniform,
|
|
uint64_t step) {
|
|
uint64_t mask = __gpu_lane_mask();
|
|
if (__gpu_is_first_in_lane(mask)) {
|
|
__scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask),
|
|
__ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
|
|
if (uniform && is_uniform(mask))
|
|
__scoped_atomic_fetch_add(uniform, step * __builtin_popcountg(mask),
|
|
__ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
|
|
}
|
|
}
|
|
|
|
#endif
|