Previously, we tracked the size of the teams reduction buffer in order to allocate it at runtime per kernel launch. This patch splits that number into two parts: the size of the reduction data (i.e., all reduction variables) and the (maximal) length of the buffer. This allows us to allocate less when we need less, e.g., if we have fewer teams than the maximal length. It also allows us to move code from Clang's codegen into the runtime, as we now know how large the reduction data is.
112 lines
5.1 KiB
LLVM
112 lines
5.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU

; Runs the openmp-opt pass over a small amdgcn module and verifies both the
; rewritten globals and the rewritten function bodies.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
target triple = "amdgcn-amd-amdhsa"

; Kernel environment: a configuration record followed by two pointers.
; The configuration record holds three i8 fields and six i32 fields.
; NOTE(review): the meaning of the individual fields is not spelled out in
; this file; confirm against the device runtime's environment definition.
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy.8, ptr, ptr }
%struct.ConfigurationEnvironmentTy.8 = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }

; Flag in addrspace(3) that @__kmpc_target_init writes and then reloads.
@IsSPMDMode = internal addrspace(3) global i32 undef

; Environment of the only kernel in this module. The first i8 is 1 on input;
; the global assertions below expect it to be 0 after the pass, with every
; other field unchanged.
@__omp_offloading_10302_b20a40e_main_l4_kernel_environment = addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy.8 { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) null to ptr), ptr addrspacecast (ptr addrspace(1) null to ptr) }

;.
; AMDGPU: @[[ISSPMDMODE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef
; AMDGPU: @[[__OMP_OFFLOADING_10302_B20A40E_MAIN_L4_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY_8:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) null to ptr), ptr addrspacecast (ptr addrspace(1) null to ptr) }
;.

; Stand-in definition named @fputs: takes no arguments, executes an acquire
; fence and returns 0. The assertions expect the body to come out of the pass
; unchanged, with the function tagged by an attribute group that the checks at
; the end of the file resolve to `nounwind`.
define i32 @fputs() {
; AMDGPU-LABEL: define {{[^@]+}}@fputs
; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] {
; AMDGPU-NEXT:    fence acquire
; AMDGPU-NEXT:    ret i32 0
;
  fence acquire
  ret i32 0
}

; Local definition of @__kmpc_target_init, called from the kernel in this file.
;
; Behavior as written:
;   * loads the i8 at field index 2 of the %struct.ConfigurationEnvironmentTy.8
;     reached through the first argument and tests bit 0x2 of it;
;   * when that bit is set and the work-item x id is 0, stores 1 to @IsSPMDMode
;     and an i8 0 to the null pointer in addrspace(3);
;   * reloads @IsSPMDMode: a value of 0 leads to `unreachable`, any other value
;     returns 0.
; The second argument (%dyn) is not used.
;
; Expected result per the assertions: the field load reads the kernel
; environment constant directly at byte offset 2, the store to and the reload
; of @IsSPMDMode are gone, and the second branch is unconditional to the
; returning block (the `unreachable` block is still printed).
;
; NOTE(review): field 2 / bit 0x2 presumably encode the SPMD execution mode;
; this file does not name them, so confirm against the device runtime.
define internal i32 @__kmpc_target_init(ptr %0, ptr %dyn) {
; AMDGPU-LABEL: define {{[^@]+}}@__kmpc_target_init
; AMDGPU-SAME: (ptr [[TMP0:%.*]], ptr [[DYN:%.*]]) #[[ATTR1:[0-9]+]] {
; AMDGPU-NEXT:    [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr addrspacecast (ptr addrspace(1) @__omp_offloading_10302_b20a40e_main_l4_kernel_environment to ptr), i64 2), align 2
; AMDGPU-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], 2
; AMDGPU-NEXT:    [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0
; AMDGPU-NEXT:    [[TMP5:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR3:[0-9]+]]
; AMDGPU-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; AMDGPU-NEXT:    [[OR_COND:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false
; AMDGPU-NEXT:    br i1 [[OR_COND]], label [[TMP7:%.*]], label [[TMP8:%.*]]
; AMDGPU:       7:
; AMDGPU-NEXT:    store i8 0, ptr addrspace(3) null, align 2147483648
; AMDGPU-NEXT:    br label [[TMP8]]
; AMDGPU:       8:
; AMDGPU-NEXT:    br label [[TMP10:%.*]]
; AMDGPU:       9:
; AMDGPU-NEXT:    unreachable
; AMDGPU:       10:
; AMDGPU-NEXT:    ret i32 0
;
  ; Third field (index 2) of the configuration record passed in %0.
  %2 = getelementptr %struct.ConfigurationEnvironmentTy.8, ptr %0, i64 0, i32 2
  %3 = load i8, ptr %2, align 2
  %4 = and i8 %3, 2
  %5 = icmp ne i8 %4, 0
  %6 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %7 = icmp eq i32 %6, 0
  ; Taken only when bit 0x2 is set AND the work-item x id is 0.
  %or.cond = select i1 %5, i1 %7, i1 false
  br i1 %or.cond, label %8, label %9

8:                                                ; preds = %1
  ; The assertions expect this store to be removed by the pass, while the
  ; store to null that follows is kept.
  store i32 1, ptr addrspace(3) @IsSPMDMode, align 4
  store i8 0, ptr addrspace(3) null, align 2147483648
  br label %9

9:                                                ; preds = %8, %1
  ; The assertions expect this load/compare/branch to collapse into an
  ; unconditional branch to the returning block.
  %10 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
  %11 = icmp eq i32 %10, 0
  br i1 %11, label %12, label %13

12:                                               ; preds = %9
  unreachable

13:                                               ; preds = %9
  ret i32 0
}

; Work-item x id intrinsic; @__kmpc_target_init compares its result against 0.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Declaration only (no body in this module); called last by the kernel.
declare void @__kmpc_target_deinit()

; Kernel entry point. Calls @__kmpc_target_init with the generic-address-space
; cast of this kernel's environment global plus %dyn (the result is unused),
; then calls @fputs and @__kmpc_target_deinit, and returns.
;
; The assertions expect the same instruction sequence after the pass, with
; attribute groups attached to the init call (`nosync nounwind`) and to the
; @fputs call (`nounwind`).
define amdgpu_kernel void @__omp_offloading_10302_b20a40e_main_l4(ptr %dyn) {
; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_10302_b20a40e_main_l4
; AMDGPU-SAME: (ptr [[DYN:%.*]]) {
; AMDGPU-NEXT:    [[TMP1:%.*]] = tail call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_10302_b20a40e_main_l4_kernel_environment to ptr), ptr [[DYN]]) #[[ATTR4:[0-9]+]]
; AMDGPU-NEXT:    br label [[TMP2:%.*]]
; AMDGPU:       2:
; AMDGPU-NEXT:    [[TMP3:%.*]] = call i32 @fputs() #[[ATTR0]]
; AMDGPU-NEXT:    tail call void @__kmpc_target_deinit()
; AMDGPU-NEXT:    ret void
;
  %1 = tail call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_10302_b20a40e_main_l4_kernel_environment to ptr), ptr %dyn)
  br label %2

2:                                                ; preds = %0
  %3 = call i32 @fputs()
  tail call void @__kmpc_target_deinit()
  ret void
}

; Attribute group referenced by the @llvm.amdgcn.workitem.id.x declaration.
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

; Single module flag: key "openmp", value 51, merge behavior 7.
; NOTE(review): 51 presumably denotes OpenMP 5.1; not stated in this file.
!llvm.module.flags = !{!0}

!0 = !{i32 7, !"openmp", i32 51}

;.
; AMDGPU: attributes #[[ATTR0]] = { nounwind }
; AMDGPU: attributes #[[ATTR1]] = { norecurse nosync nounwind }
; AMDGPU: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; AMDGPU: attributes #[[ATTR3]] = { nosync }
; AMDGPU: attributes #[[ATTR4]] = { nosync nounwind }
;.
; AMDGPU: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 51}
;.