[flang][rt] Add noinline attributes for CUDA compile path for successful compilation (#161760)

NVCC inlines more aggressively than Clang/GCC, which causes the exported
functions in extrema.cpp and findloc.cpp to become extremely large from
function specializations and leads to compilation timeouts. Marking the 2
functions in this change as noinline for NVCC alleviates this problem,
because it removes the worst of the cross-matrix argument specializations.

Also remove the workaround from
https://github.com/llvm/llvm-project/pull/156542 that excluded
findloc.cpp from the CUDA flang-rt build.

Testing:
ninja flang-rt builds in ~30 minutes; these 2 files build in ~3 minutes.
This commit is contained in:
modiking 2025-10-03 09:48:59 -07:00 committed by GitHub
parent d0e98909d2
commit 74180eb024
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 13 additions and 10 deletions

View File

@ -178,9 +178,6 @@ endif ()
if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
set(sources ${gpu_sources})
elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
# findloc.cpp has some issues with higher compute capability. Remove it
# from CUDA build until we can lower its memory footprint.
list(REMOVE_ITEM supported_sources findloc.cpp)
set(sources ${supported_sources})
else ()
set(sources ${supported_sources} ${host_sources} ${f128_sources})

View File

@ -397,9 +397,12 @@ template <TypeCategory CAT, bool IS_MAX,
template <typename, bool, bool> class COMPARE>
struct DoPartialMaxOrMinLocHelper {
template <int KIND> struct Functor {
RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
const Descriptor &x, int kind, int dim, const Descriptor *mask,
bool back, Terminator &terminator) const {
// NVCC inlines more aggressively which causes too many specializations of
// this function to be inlined causing compiler timeouts. Set as
// noinline to allow compilation to complete.
RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(const char *intrinsic,
Descriptor &result, const Descriptor &x, int kind, int dim,
const Descriptor *mask, bool back, Terminator &terminator) const {
DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
intrinsic, result, x, kind, dim, mask, back, terminator);
}

View File

@ -153,10 +153,13 @@ template <TypeCategory CAT,
class HELPER>
struct NumericFindlocHelper {
template <int KIND> struct Functor {
RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind,
Descriptor &result, const Descriptor &x, const Descriptor &target,
int kind, int dim, const Descriptor *mask, bool back,
Terminator &terminator) const {
// NVCC inlines more aggressively which causes too many specializations of
// this function to be inlined causing compiler timeouts. Set as
// noinline to allow compilation to complete.
RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(TypeCategory targetCat,
int targetKind, Descriptor &result, const Descriptor &x,
const Descriptor &target, int kind, int dim, const Descriptor *mask,
bool back, Terminator &terminator) const {
switch (targetCat) {
case TypeCategory::Integer:
case TypeCategory::Unsigned: