
Through the new `-foffload-via-llvm` flag, CUDA kernels can now be lowered to the LLVM/Offload API. On the Clang side, this is simply done by using the OpenMP offload toolchain and emitting calls to `llvm*` functions to orchestrate the kernel launch rather than `cuda*` functions. These `llvm*` functions are implemented on top of the existing LLVM/Offload API. As we are about to redefine the Offload API, this will help us in the design process as a second offload language. We do not support any CUDA APIs yet, however, we could: https://www.osti.gov/servlets/purl/1892137 For proper host execution we need to resurrect/rebase https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf (which was designed for debugging). ``` ❯❯❯ cat test.cu extern "C" { void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); } __global__ void square(int *A) { *A = 42; } int main(int argc, char **argv) { int DevNo = 0; int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo)); *Ptr = 7; printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); square<<<1, 1>>>(Ptr); printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); llvm_omp_target_free_shared(Ptr, DevNo); } ❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native ❯❯❯ llvm-objdump --offloading test123 test123: file format elf64-x86-64 OFFLOADING IMAGE [0]: kind elf arch gfx90a triple amdgcn-amd-amdhsa producer openmp ❯❯❯ LIBOMPTARGET_INFO=16 ./test123 Ptr 0x155448ac8000, *Ptr 7 Ptr 0x155448ac8000, *Ptr 42 ```
91 lines
5.9 KiB
Plaintext
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
|
|
// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -foffload-via-llvm -emit-llvm -o - | FileCheck %s --check-prefix=HST
|
|
// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -foffload-via-llvm -emit-llvm -o - | FileCheck %s --check-prefix=DEV
|
|
|
|
// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument.
|
|
|
|
#define __OFFLOAD_VIA_LLVM__ 1
|
|
#include "Inputs/cuda.h"
|
|
|
|
// HST-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_(
|
|
// HST-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// HST-NEXT: [[ENTRY:.*:]]
|
|
// HST-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
|
|
// HST-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// HST-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
|
|
// HST-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 4
|
|
// HST-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16
|
|
// HST-NEXT: [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16
|
|
// HST-NEXT: [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8
|
|
// HST-NEXT: [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8
|
|
// HST-NEXT: [[SHMEM_SIZE:%.*]] = alloca i32, align 4
|
|
// HST-NEXT: [[STREAM:%.*]] = alloca ptr, align 4
|
|
// HST-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
|
|
// HST-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
|
|
// HST-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
|
|
// HST-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
|
|
// HST-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0
|
|
// HST-NEXT: store i64 16, ptr [[TMP4]], align 16
|
|
// HST-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1
|
|
// HST-NEXT: store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8
|
|
// HST-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2
|
|
// HST-NEXT: store ptr null, ptr [[TMP6]], align 4
|
|
// HST-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4
|
|
// HST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0
|
|
// HST-NEXT: store i32 [[TMP7]], ptr [[TMP8]], align 16
|
|
// HST-NEXT: [[TMP9:%.*]] = load i16, ptr [[DOTADDR1]], align 2
|
|
// HST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1
|
|
// HST-NEXT: store i16 [[TMP9]], ptr [[TMP10]], align 4
|
|
// HST-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
|
|
// HST-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2
|
|
// HST-NEXT: store ptr [[TMP11]], ptr [[TMP12]], align 8
|
|
// HST-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
|
|
// HST-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3
|
|
// HST-NEXT: store ptr [[TMP13]], ptr [[TMP14]], align 4
|
|
// HST-NEXT: [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]])
|
|
// HST-NEXT: [[TMP16:%.*]] = load i32, ptr [[SHMEM_SIZE]], align 4
|
|
// HST-NEXT: [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 4
|
|
// HST-NEXT: [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, ptr noundef byval([[STRUCT_DIM3]]) align 4 [[GRID_DIM]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[BLOCK_DIM]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i32 noundef [[TMP16]], ptr noundef [[TMP17]])
|
|
// HST-NEXT: br label %[[SETUP_END:.*]]
|
|
// HST: [[SETUP_END]]:
|
|
// HST-NEXT: ret void
|
|
//
|
|
// DEV-LABEL: define dso_local void @_Z3fooisPvS_(
|
|
// DEV-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// DEV-NEXT: [[ENTRY:.*:]]
|
|
// DEV-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
|
|
// DEV-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
|
|
// DEV-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
|
|
// DEV-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 4
|
|
// DEV-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
|
|
// DEV-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
|
|
// DEV-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
|
|
// DEV-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
|
|
// DEV-NEXT: ret void
|
|
//
|
|
// Kernel under test: a mix of scalar (int, short) and pointer parameters with
// an empty body. On the host, Clang emits the _Z18__device_stub__fooisPvS_
// stub (HST checks above) that packs the arguments into KERNEL_ARGS /
// KERNEL_LAUNCH_PARAMS and calls __llvmPopCallConfiguration + llvmLaunchKernel
// instead of the cuda* runtime; on the device, the real (empty) definition is
// emitted (DEV checks above). Do not rename or change the signature — the
// autogenerated CHECK lines depend on the mangled name and argument layout.
__global__ void foo(int, short, void *, void *) {}
|
|
|
|
// HST-LABEL: define dso_local void @_Z5test1Pv(
|
|
// HST-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
// HST-NEXT: [[ENTRY:.*:]]
|
|
// HST-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 4
|
|
// HST-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
|
|
// HST-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
|
|
// HST-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 4
|
|
// HST-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3, i32 noundef 1, i32 noundef 1)
|
|
// HST-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7, i32 noundef 1, i32 noundef 1)
|
|
// HST-NEXT: [[CALL:%.*]] = call i32 @__llvmPushCallConfiguration(ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null)
|
|
// HST-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0
|
|
// HST-NEXT: br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]]
|
|
// HST: [[KCALL_CONFIGOK]]:
|
|
// HST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
|
|
// HST-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
|
|
// HST-NEXT: call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP0]], ptr noundef [[TMP1]]) #[[ATTR3:[0-9]+]]
|
|
// HST-NEXT: br label %[[KCALL_END]]
|
|
// HST: [[KCALL_END]]:
|
|
// HST-NEXT: ret void
|
|
//
|
|
// Host-side launch of foo via the triple-chevron syntax. The HST checks above
// verify this lowers to __llvmPushCallConfiguration (with dim3 grid {3,1,1}
// and block {7,1,1}, shmem 0, null stream) guarded by a config-ok branch, then
// a call to the _Z18__device_stub__fooisPvS_ stub — i.e. the LLVM/Offload
// launch path rather than cudaLaunchKernel. Do not alter the launch arguments;
// the autogenerated IR checks pin them.
void test1(void *Ptr) {
|


  foo<<<3, 7>>>(13, 1, Ptr, Ptr);
|


}
|