[Clang][OpenCL][AMDGPU] OpenCL Kernel stubs should be assigned alwaysinline attribute (#137769)
OpenCL kernel bodies are emitted as stubs, and each kernel is emitted as a call to its respective stub (https://github.com/llvm/llvm-project/pull/115821). The stub function should be marked alwaysinline, since a call to the stub can cause a performance drop. Co-authored-by: anikelal <anikelal@amd.com>
This commit is contained in:
parent
62385b8487
commit
c3ce5684a8
@ -6172,6 +6172,22 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
|
||||
CodeGenFunction(*this).GenerateCode(GD, Fn, FI);
|
||||
|
||||
setNonAliasAttributes(GD, Fn);
|
||||
|
||||
bool ShouldAddOptNone = !CodeGenOpts.DisableO0ImplyOptNone &&
|
||||
(CodeGenOpts.OptimizationLevel == 0) &&
|
||||
!D->hasAttr<MinSizeAttr>();
|
||||
|
||||
if (D->hasAttr<OpenCLKernelAttr>()) {
|
||||
if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub &&
|
||||
!D->hasAttr<NoInlineAttr>() &&
|
||||
!Fn->hasFnAttribute(llvm::Attribute::NoInline) &&
|
||||
!D->hasAttr<OptimizeNoneAttr>() &&
|
||||
!Fn->hasFnAttribute(llvm::Attribute::OptimizeNone) &&
|
||||
!ShouldAddOptNone) {
|
||||
Fn->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
}
|
||||
}
|
||||
|
||||
SetLLVMFunctionAttributesForDefinition(D, Fn);
|
||||
|
||||
if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
|
||||
|
||||
@ -492,7 +492,7 @@ kernel void test_target_features_kernel(global int *i) {
|
||||
// GFX900-NEXT: ret void
|
||||
//
|
||||
//
|
||||
// GFX900: Function Attrs: convergent norecurse nounwind
|
||||
// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
|
||||
// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test(
|
||||
// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
|
||||
// GFX900-NEXT: [[ENTRY:.*:]]
|
||||
@ -640,7 +640,7 @@ kernel void test_target_features_kernel(global int *i) {
|
||||
// GFX900-NEXT: ret void
|
||||
//
|
||||
//
|
||||
// GFX900: Function Attrs: convergent norecurse nounwind
|
||||
// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
|
||||
// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel(
|
||||
// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
|
||||
// GFX900-NEXT: [[ENTRY:.*:]]
|
||||
@ -832,7 +832,7 @@ kernel void test_target_features_kernel(global int *i) {
|
||||
// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
|
||||
// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
|
||||
// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
|
||||
// GFX900: attributes #[[ATTR3]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
|
||||
// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
|
||||
// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
|
||||
// GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
|
||||
// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
|
||||
kernel void ker() {};
|
||||
// CHECK: define{{.*}}@ker() #[[ATTR0:[0-9]+]]
|
||||
// CHECK: call void @__clang_ocl_kern_imp_ker() #[[ATTR2:[0-9]+]]
|
||||
|
||||
// CHECK: define{{.*}}@__clang_ocl_kern_imp_ker() #[[ATTR1:[0-9]+]]
|
||||
|
||||
@ -18,6 +17,3 @@ void foo() {};
|
||||
|
||||
// CHECK: attributes #[[ATTR1]]
|
||||
// CHECK-NOT: uniform-work-group-size
|
||||
|
||||
// CHECK: attributes #[[ATTR2]]
|
||||
// CHECK-NOT: uniform-work-group-size
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLEX86
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86
|
||||
// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefix=CHECK-LIFETIMES
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
@ -39,12 +39,6 @@ void callee(int id, __global int *out) {
|
||||
out[id] = id;
|
||||
}
|
||||
|
||||
// TRIPLESPIR: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
|
||||
// TRIPLESPIR: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue({{.*}})
|
||||
|
||||
// TRIPLEX86: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
|
||||
// TRIPLEX86: call void @__clang_ocl_kern_imp_device_side_enqueue({{.*}})
|
||||
|
||||
// COMMON-LABEL: define{{.*}} void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
|
||||
kernel void device_side_enqueue(global int *a, global int *b, int i) {
|
||||
// SPIR: %default_queue = alloca target("spirv.Queue")
|
||||
|
||||
@ -127,7 +127,7 @@ void test_not_unroll() {
|
||||
// CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]]
|
||||
|
||||
// CHECK-LABEL: @assume_convergent_asm
|
||||
// CHECK: tail call void asm sideeffect "s_barrier", ""() #5
|
||||
// CHECK: tail call void asm sideeffect "s_barrier", ""() #6
|
||||
kernel void assume_convergent_asm()
|
||||
{
|
||||
__asm__ volatile("s_barrier");
|
||||
@ -138,6 +138,7 @@ kernel void assume_convergent_asm()
|
||||
// CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} }
|
||||
// CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
|
||||
// CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} }
|
||||
// CHECK: attributes #5 = { {{[^}]*}}convergent{{[^}]*}} }
|
||||
// CHECK: attributes #6 = { {{[^}]*}}nounwind{{[^}]*}} }
|
||||
// CHECK: attributes #7 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} }
|
||||
// CHECK: attributes #5 = { {{[^}]*}}alwaysinline convergent{{[^}]*}} }
|
||||
// CHECK: attributes #6 = { {{[^}]*}}convergent{{[^}]*}} }
|
||||
// CHECK: attributes #7 = { {{[^}]*}}nounwind{{[^}]*}} }
|
||||
// CHECK: attributes #8 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} }
|
||||
|
||||
@ -9,15 +9,8 @@
|
||||
typedef struct {int a;} ndrange_t;
|
||||
|
||||
kernel void test(int i) {
|
||||
|
||||
// AMDGPU-LABEL: define {{.*}} amdgpu_kernel void @test
|
||||
// AMDGPU-LABEL: call void @__clang_ocl_kern_imp_test(i32 noundef %0)
|
||||
|
||||
// SPIR-LABEL: define {{.*}} spir_kernel void @test
|
||||
// SPIR-LABEL: call spir_func void @__clang_ocl_kern_imp_test(i32 noundef %0)
|
||||
|
||||
// AMDGPU-LABEL: define {{.*}} void @__clang_ocl_kern_imp_test
|
||||
// SPIR-LABEL: define {{.*}} spir_func void @__clang_ocl_kern_imp_test
|
||||
|
||||
// COMMON-LABEL: entry:
|
||||
// AMDGPU: %block_sizes = alloca [1 x i64]
|
||||
@ -44,5 +37,5 @@ kernel void test(int i) {
|
||||
|
||||
// CHECK-DEBUG: ![[TESTFILE:[0-9]+]] = !DIFile(filename: "<stdin>"
|
||||
// CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", linkageName: "__clang_ocl_kern_imp_test", {{.*}} file: ![[TESTFILE]]
|
||||
// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 33)
|
||||
// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 34, scope: ![[IFSCOPE]])
|
||||
// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 26)
|
||||
// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 27, scope: ![[IFSCOPE]])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user