llvm-project/clang/test/OpenMP/declare_target_codegen_globalization.cpp
Joseph Huber 68d133a3e8 [OpenMP] Simplify GPU memory globalization
Summary:
Memory globalization is required to maintain OpenMP standard semantics for data sharing between
worker and master threads. GPU threads cannot share stack data with one another, so the data must be
allocated in global or shared memory instead. Currently this is implemented entirely in the frontend
using the `__kmpc_data_sharing_push_stack` and `__kmpc_data_sharing_pop_stack` functions to emulate
standard CPU stack sharing. The frontend scans the target region for variables that escape the region
and must be shared between the threads. Each such variable then has a field created for it in a
global record type.
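
As a rough illustration of that old scheme (not taken from this patch; the entry-point signatures and
the record layout below are assumed for the sketch), the frontend-driven codegen behaved roughly like:

  #include <cstddef>
  #include <cstdint>

  // Assumed declarations for the old data-sharing runtime entry points.
  extern "C" void *__kmpc_data_sharing_push_stack(size_t Size, int16_t UseSharedMemory);
  extern "C" void __kmpc_data_sharing_pop_stack(void *Ptr);

  // One field per escaping variable, gathered into a single global record.
  struct _globalized_locals_ty { int b; };

  void old_globalization_sketch() {
    // Push one frame holding the whole record of escaping variables...
    void *Frame = __kmpc_data_sharing_push_stack(sizeof(_globalized_locals_ty), /*UseSharedMemory=*/0);
    auto *Rec = static_cast<_globalized_locals_ty *>(Frame);
    Rec->b = 0; // ...and rewrite every access to `b` to go through the record.
    __kmpc_data_sharing_pop_stack(Frame);
  }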

This patch replaces that functionality with a single allocation call per variable, effectively
mimicking an alloca instruction for each variable that must be shared between the threads. This will
be much slower than the current solution, but it makes optimization much easier, as we can analyze
each variable independently and determine whether it is captured. In the future, we can replace these
calls with an `alloca`, and small allocations can be pushed to shared memory.
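
A minimal sketch of the new shape (the two runtime calls match the ones checked in the test below;
the surrounding C++ wrapper is only illustrative):

  #include <cstddef>

  // Declarations assumed from the calls checked in the test below.
  extern "C" void *__kmpc_alloc_shared(size_t Size);
  extern "C" void __kmpc_free_shared(void *Ptr);

  void new_globalization_sketch() {
    // Each escaping variable gets its own allocation, so it can be analyzed
    // independently and later turned back into an `alloca` or moved to shared memory.
    int *B = static_cast<int *>(__kmpc_alloc_shared(sizeof(int)));
    *B = 0; // shared accesses go through the returned pointer
    __kmpc_free_shared(B);
  }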

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D97680
2021-06-22 10:52:46 -04:00


// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// expected-no-diagnostics
int foo(int &a) { return a; }
int bar() {
  int a;
  return foo(a);
}
int maini1() {
  int a;
#pragma omp target parallel map(from:a)
  {
    int b;
    a = foo(b) + bar();
  }
  return a;
}
// parallel region
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6maini1v_l16
// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to i8*
// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1)
// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK1: .omp.deinit:
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[B:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]]
// CHECK1-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]]
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3fooRi
// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT: ret i32 [[TMP1]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3barv
// CHECK1-SAME: () #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A]])
// CHECK1-NEXT: ret i32 [[CALL]]
//