
Specifying a kernel with the `ptx_kernel` or `amdgpu_kernel` calling convention is more idiomatic and more compile-time performant than using the `nvvm.annotations` `!"kernel"` metadata. Transition OMPIRBuilder to use calling conventions for PTX kernels and no longer emit `nvvm.annotations`. Update OpenMPOpt to work with kernels specified via calling convention as well as metadata. Update OpenMP tests to use the calling conventions.
105 lines
5.2 KiB
LLVM
105 lines
5.2 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
|
|
; RUN: opt < %s -S -passes=openmp-opt -openmp-opt-inline-device | FileCheck %s
|
|
|
|
; Record type of @1 below: four i32 fields followed by a pointer.
%struct.ident_t = type { i32, i32, i32, i32, ptr }
|
|
; Kernel environment record: a configuration record followed by two pointers.
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
|
|
; Configuration record: three i8 fields followed by six i32 fields.
; NOTE(review): the meaning of the individual fields is not visible in this
; file -- confirm against the OpenMP device runtime before relying on it.
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
|
|
; NUL-terminated string referenced by the pointer field of @1.
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
|
|
; ident_t instance pointing at @0; referenced from @kernel_environment.
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8
|
|
; External byte that the kernel's user-code path stores to.
@G = external global i8
|
|
|
|
; Input kernel environment handed to __kmpc_target_init. The three leading i8
; configuration fields start as 1, 0, 1; the global assertions in this file
; expect them to read 0, 0, 3 after the pass has run.
@kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
|
|
|
|
; Function Attrs: convergent norecurse nounwind
|
|
;.
|
|
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
|
|
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
|
|
; CHECK: @G = external global i8
|
|
; CHECK: @kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
|
|
;.
|
|
; Kernel under test, marked as a kernel through the ptx_kernel calling
; convention. The input body calls __kmpc_target_init; when that returns -1 it
; stores the result of __kmpc_is_spmd_exec_mode() to @G, calls @bar, calls
; __kmpc_target_deinit and returns. Any other result returns via worker.exit.
; The autogenerated assertions that follow describe the expected output: an
; extra hardware-thread-id test in the entry block, a constant store to @G, and
; no remaining call to @bar.
define weak ptx_kernel void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
|
|
; CHECK: Function Attrs: norecurse nounwind
|
|
; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @kernel_environment, ptr [[DYN:%.*]])
|
|
; CHECK-NEXT: [[THREAD_ID_IN_BLOCK:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
|
|
; CHECK-NEXT: [[THREAD_IS_MAIN:%.*]] = icmp ne i32 [[THREAD_ID_IN_BLOCK]], 0
|
|
; CHECK-NEXT: br i1 [[THREAD_IS_MAIN]], label [[EXIT_THREADS:%.*]], label [[MAIN_THREAD_USER_CODE:%.*]]
|
|
; CHECK: exit.threads:
|
|
; CHECK-NEXT: ret void
|
|
; CHECK: main.thread.user_code:
|
|
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
|
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
|
; CHECK: user_code.entry:
|
|
; CHECK-NEXT: store i8 1, ptr @G, align 1
|
|
; CHECK-NEXT: call void @__kmpc_target_deinit()
|
|
; CHECK-NEXT: ret void
|
|
; CHECK: worker.exit:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
; A result of -1 from __kmpc_target_init selects the user-code path; every
; other value branches to worker.exit.
%0 = call i32 @__kmpc_target_init(ptr @kernel_environment, ptr %dyn)
|
|
%exec_user_code = icmp eq i32 %0, -1
|
|
br i1 %exec_user_code, label %user_code.entry, label %worker.exit
|
|
|
|
user_code.entry: ; preds = %entry
|
|
; Ensure we see a 0 here as the kernel doesn't have parallel regions and we want
|
|
; generic execution.
|
|
; TODO: This is not perfect. We should rather go for SPMD mode and tell the runtime
|
|
; to only spawn a single thread. Further, we then should not guard any code.
; NOTE(review): the autogenerated assertions for this function expect
; `store i8 1, ptr @G`, not 0, so the comment above looks stale -- confirm
; which value is intended and update the comment or regenerate the assertions.
|
|
%isSPMD = call i8 @__kmpc_is_spmd_exec_mode()
|
|
store i8 %isSPMD, ptr @G
|
|
; This call is absent from the expected output asserted above.
call void @bar() #2
|
|
call void @__kmpc_target_deinit()
|
|
ret void
|
|
|
|
worker.exit: ; preds = %entry
|
|
ret void
|
|
}
|
|
|
|
; Functions called by the kernel above; this module only declares them.
; Its result is stored to @G on the kernel's user-code path.
declare i8 @__kmpc_is_spmd_exec_mode()
|
|
|
|
; Takes @kernel_environment and the kernel's pointer argument; the kernel
; compares the i32 result against -1.
declare i32 @__kmpc_target_init(ptr, ptr)
|
|
|
|
; Called on the user-code path right before the kernel returns.
declare void @__kmpc_target_deinit()
|
|
|
|
; Function Attrs: convergent nounwind
|
|
; Empty hidden helper called from the kernel's user-code path. The assertions
; below expect the body to stay a single `ret void` and the function to carry
; the additional `alwaysinline` attribute in the output.
define hidden void @bar() #1 {
|
|
; CHECK: Function Attrs: alwaysinline convergent nounwind
|
|
; CHECK-LABEL: @bar(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
ret void
|
|
}
|
|
|
|
; #0: attribute group of the kernel @__omp_offloading_fd02_c0934fc2_foo_l4.
attributes #0 = { convergent norecurse nounwind "kernel" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
|
|
; #1: attribute group of the helper @bar.
attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
|
|
; #2: call-site attributes on the kernel's call to @bar.
attributes #2 = { convergent }
|
|
|
|
; Named metadata: the offload-info list, the module flags and the producer
; identification string.
!omp_offload.info = !{!0}
|
|
!llvm.module.flags = !{!2, !3, !4, !5, !6}
|
|
!llvm.ident = !{!7}
|
|
|
|
; Offload entry referenced by !omp_offload.info.
; NOTE(review): no node !1 is defined in this module; presumably it was the
; kernel annotation made redundant by the ptx_kernel calling convention --
; confirm against the file history.
!0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
|
|
; Module flags !2..!6, in the order listed by !llvm.module.flags.
!2 = !{i32 1, !"wchar_size", i32 4}
|
|
!3 = !{i32 7, !"openmp", i32 50}
|
|
!4 = !{i32 7, !"openmp-device", i32 50}
|
|
!5 = !{i32 7, !"PIC Level", i32 2}
|
|
!6 = !{i32 7, !"frame-pointer", i32 2}
|
|
; Producer string referenced by !llvm.ident.
!7 = !{!"clang version 14.0.0"}
|
|
;.
|
|
; CHECK: attributes #[[ATTR0:[0-9]+]] = { norecurse nounwind "frame-pointer"="all" "kernel" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
|
|
; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" }
|
|
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
|
|
;.
|
|
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
|
|
; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
|
|
; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50}
|
|
; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
|
|
; The expected "PIC Level" flag must mirror input node !5 (behavior i32 7),
; just as the neighbouring module-flag assertions mirror their inputs verbatim.
; CHECK: [[META4:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
|
|
; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
|
|
; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
|
|
;.
|