llvm-project/clang/test/CodeGenCUDA/offload_via_llvm.cu
Alex MacLean 4583f6d344
[NVPTX] Switch front-ends and tests to ptx_kernel cc (#120806)
the `ptx_kernel` calling convention is a more idiomatic and standard way
of specifying a NVPTX kernel than using the metadata which is not
supposed to change the meaning of the program. Further, checking the
calling convention is significantly faster than traversing the metadata,
improving compile time.

This change updates the clang and mlir frontends as well as the
NVPTXCtorDtorLowering pass to emit kernels using the calling convention.
In addition, this updates all NVPTX unit tests to use the calling
convention as well.
2025-01-07 18:24:50 -08:00

91 lines
5.9 KiB
Plaintext

// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -foffload-via-llvm -emit-llvm -o - | FileCheck %s --check-prefix=HST
// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -foffload-via-llvm -emit-llvm -o - | FileCheck %s --check-prefix=DEV
// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument.
#define __OFFLOAD_VIA_LLVM__ 1
#include "Inputs/cuda.h"
// HST-LABEL: define dso_local ptx_kernel void @_Z18__device_stub__fooisPvS_(
// HST-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
// HST-NEXT: [[ENTRY:.*:]]
// HST-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
// HST-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// HST-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
// HST-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 4
// HST-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16
// HST-NEXT: [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16
// HST-NEXT: [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8
// HST-NEXT: [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8
// HST-NEXT: [[SHMEM_SIZE:%.*]] = alloca i32, align 4
// HST-NEXT: [[STREAM:%.*]] = alloca ptr, align 4
// HST-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
// HST-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
// HST-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
// HST-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
// HST-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0
// HST-NEXT: store i64 16, ptr [[TMP4]], align 16
// HST-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1
// HST-NEXT: store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8
// HST-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2
// HST-NEXT: store ptr null, ptr [[TMP6]], align 4
// HST-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4
// HST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0
// HST-NEXT: store i32 [[TMP7]], ptr [[TMP8]], align 16
// HST-NEXT: [[TMP9:%.*]] = load i16, ptr [[DOTADDR1]], align 2
// HST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// HST-NEXT: store i16 [[TMP9]], ptr [[TMP10]], align 4
// HST-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
// HST-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2
// HST-NEXT: store ptr [[TMP11]], ptr [[TMP12]], align 8
// HST-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
// HST-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3
// HST-NEXT: store ptr [[TMP13]], ptr [[TMP14]], align 4
// HST-NEXT: [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]])
// HST-NEXT: [[TMP16:%.*]] = load i32, ptr [[SHMEM_SIZE]], align 4
// HST-NEXT: [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 4
// HST-NEXT: [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, ptr noundef byval([[STRUCT_DIM3]]) align 4 [[GRID_DIM]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[BLOCK_DIM]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i32 noundef [[TMP16]], ptr noundef [[TMP17]])
// HST-NEXT: br label %[[SETUP_END:.*]]
// HST: [[SETUP_END]]:
// HST-NEXT: ret void
//
// DEV-LABEL: define dso_local ptx_kernel void @_Z3fooisPvS_(
// DEV-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
// DEV-NEXT: [[ENTRY:.*:]]
// DEV-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
// DEV-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// DEV-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
// DEV-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 4
// DEV-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
// DEV-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
// DEV-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
// DEV-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
// DEV-NEXT: ret void
//
__global__ void foo(int, short, void *, void *) {}
// HST-LABEL: define dso_local void @_Z5test1Pv(
// HST-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
// HST-NEXT: [[ENTRY:.*:]]
// HST-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
// HST-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
// HST-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
// HST-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 4
// HST-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3, i32 noundef 1, i32 noundef 1)
// HST-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7, i32 noundef 1, i32 noundef 1)
// HST-NEXT: [[CALL:%.*]] = call i32 @__llvmPushCallConfiguration(ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null)
// HST-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0
// HST-NEXT: br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]]
// HST: [[KCALL_CONFIGOK]]:
// HST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
// HST-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
// HST-NEXT: call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP0]], ptr noundef [[TMP1]]) #[[ATTR3:[0-9]+]]
// HST-NEXT: br label %[[KCALL_END]]
// HST: [[KCALL_END]]:
// HST-NEXT: ret void
//
void test1(void *Ptr) {
foo<<<3, 7>>>(13, 1, Ptr, Ptr);
}