
Summary: Memory globalization is required to maintain OpenMP standard semantics for data sharing between worker and master threads. The GPU cannot share a thread's private stack memory with other threads, so the data must instead be allocated in global or shared memory. Currently this is implemented entirely in the frontend using the `__kmpc_data_sharing_push_stack` and `__kmpc_data_sharing_pop_stack` functions to emulate standard CPU stack sharing: the frontend scans the target region for variables that escape the region and must be shared between the threads, and each such variable gets a field in a global record type. This patch replaces that functionality with a single allocation call per variable, effectively mimicking an alloca instruction for the variables that must be shared between the threads. This will be much slower than the current solution, but it is much easier to optimize because each variable can be analyzed independently to determine whether it is captured. In the future, these calls can be replaced with an `alloca`, and small allocations can be pushed to shared memory.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D97680
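As a rough sketch of the new lowering (illustrative value names, mirroring the shape of the CHECK lines in the test below), an escaping 4-byte local such as `a` in `bar()` now becomes a pair of runtime calls instead of a field in a globalized record:

    %a = call i8* @__kmpc_alloc_shared(i64 4)    ; one allocation call per escaping variable
    %a.on.stack = bitcast i8* %a to i32*         ; used wherever the alloca result was used before
    ; ... uses of %a.on.stack ...
    call void @__kmpc_free_shared(i8* %a)        ; released when the variable goes out of scope

Because each variable gets its own call pair, later passes can examine them individually and, when the pointer does not escape, fold the pair back into an `alloca` or move small allocations into shared memory.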
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// expected-no-diagnostics

int foo(int &a) { return a; }

int bar() {
  int a;
  return foo(a);
}


int maini1() {
  int a;
#pragma omp target parallel map(from:a)
  {
    int b;
    a = foo(b) + bar();
  }
  return a;
}

// parallel region


// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6maini1v_l16
// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT:    br label [[DOTEXECUTE:%.*]]
// CHECK1:       .execute:
// CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP0]] to i8*
// CHECK1-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1)
// CHECK1-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
// CHECK1:       .omp.deinit:
// CHECK1-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
// CHECK1:       .exit:
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[B:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]]
// CHECK1-NEXT:    [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]]
// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
// CHECK1-NEXT:    store i32 [[ADD]], i32* [[TMP0]], align 4
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3fooRi
// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT:    ret i32 [[TMP1]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3barv
// CHECK1-SAME: () #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK1-NEXT:    [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
// CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[A]])
// CHECK1-NEXT:    ret i32 [[CALL]]
//