
Currently, we use `AAAMDWavesPerEU` to iteratively update values based on attributes from the associated function, potentially propagating user-annotated values, along with `AAAMDFlatWorkGroupSize`. Similarly, we have `AAAMDFlatWorkGroupSize`. However, since the value calculated through the flat workgroup size always dominates the user annotation (i.e., the attribute), running `AAAMDWavesPerEU` iteratively is unnecessary if no user-annotated value exists. This PR completely rewrites how the `amdgpu-waves-per-eu` attribute is handled in `AMDGPUAttributor`. The key changes are as follows: - `AAAMDFlatWorkGroupSize` remains unchanged. - `AAAMDWavesPerEU` now only propagates user-annotated values. - A new function is added to check and update `amdgpu-waves-per-eu` based on the following rules: - No waves per eu, no flat workgroup size: Assume a flat workgroup size of `1,1024` and compute waves per eu based on this. - No waves per eu, flat workgroup size exists: Use the provided flat workgroup size to compute waves-per-eu. - Waves per eu exists, no flat workgroup size: This is a tricky case. In this PR, we assume a flat workgroup size of `1,1024`, but this can be adjusted if a different approach is preferred. Alternatively, we could directly use the user-annotated value. - Both waves per eu and flat workgroup size exist: If there’s a conflict, the value derived from the flat workgroup size takes precedence over waves per eu. This PR also updates the logic for merging two waves per eu pairs. The current implementation, which uses `clampStateAndIndicateChange` to compute a union, might not be ideal. If we think from ensure proper resource allocation perspective, for instance, if one pair specifies a minimum of 2 waves per eu, and another specifies a minimum of 4, we should guarantee that 4 waves per eu can be supported, as failing to do so could result in excessive resource allocation per wave. A similar principle applies to the upper bound. Thus, the PR uses the following approach for merging two pairs, `lo_a,up_a` and `lo_b,up_b`: `max(lo_a, lo_b), max(up_a, up_b)`. This ensures that resource allocation adheres to the stricter constraints from both inputs. Fix #123092.
491 lines
26 KiB
LLVM
491 lines
26 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
|
|
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefix=HSA %s
|
|
|
|
declare i32 @llvm.amdgcn.workgroup.id.x() #0
|
|
declare i32 @llvm.amdgcn.workgroup.id.y() #0
|
|
declare i32 @llvm.amdgcn.workgroup.id.z() #0
|
|
|
|
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
|
declare i32 @llvm.amdgcn.workitem.id.y() #0
|
|
declare i32 @llvm.amdgcn.workitem.id.z() #0
|
|
|
|
declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
|
|
declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
|
|
declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0
|
|
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0
|
|
|
|
declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #2
|
|
declare i1 @llvm.amdgcn.is.private(ptr nocapture) #2
|
|
|
|
define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tgid_x
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tgid_y
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tgid_x_y
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tgid_z
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tgid_x_z
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tgid_y_z
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
%val2 = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
store volatile i32 %val2, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tidig_x
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val = call i32 @llvm.amdgcn.workitem.id.x()
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tidig_y
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val = call i32 @llvm.amdgcn.workitem.id.y()
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tidig_z
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val = call i32 @llvm.amdgcn.workitem.id.z()
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tidig_x_tgid_x
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%val1 = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workitem.id.y()
|
|
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
|
|
; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
|
|
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
store volatile i32 %val2, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_all_workitems
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] {
|
|
; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
|
; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
|
|
; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
|
|
; HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
; HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
; HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
|
|
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
|
|
%val3 = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
%val4 = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
%val5 = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
store volatile i32 %val0, ptr addrspace(1) %ptr
|
|
store volatile i32 %val1, ptr addrspace(1) %ptr
|
|
store volatile i32 %val2, ptr addrspace(1) %ptr
|
|
store volatile i32 %val3, ptr addrspace(1) %ptr
|
|
store volatile i32 %val4, ptr addrspace(1) %ptr
|
|
store volatile i32 %val5, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR10:[0-9]+]] {
|
|
; HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
|
|
; HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
|
|
%val = load i32, ptr addrspace(4) %dispatch.ptr
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_queue_ptr(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_queue_ptr
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR11:[0-9]+]] {
|
|
; HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
|
|
; HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
|
|
%val = load i32, ptr addrspace(4) %dispatch.ptr
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_kernarg_segment_ptr(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
|
; HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
|
|
; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
|
%val = load i32, ptr addrspace(4) %dispatch.ptr
|
|
store i32 %val, ptr addrspace(1) %ptr
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
|
|
; HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
|
|
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
|
|
; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%stof = addrspacecast ptr addrspace(3) %ptr to ptr
|
|
store volatile i32 0, ptr %stof
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
|
|
; HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
|
|
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
|
|
; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%stof = addrspacecast ptr addrspace(5) %ptr to ptr
|
|
store volatile i32 0, ptr %stof
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast
|
|
; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
|
|
; HSA-NEXT: store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%ftos = addrspacecast ptr %ptr to ptr addrspace(3)
|
|
store volatile i32 0, ptr addrspace(3) %ftos
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast
|
|
; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
|
|
; HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%ftos = addrspacecast ptr %ptr to ptr addrspace(5)
|
|
store volatile i32 0, ptr addrspace(5) %ftos
|
|
ret void
|
|
}
|
|
|
|
; No-op addrspacecast should not use queue ptr
|
|
define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
|
|
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
|
|
; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%stof = addrspacecast ptr addrspace(1) %ptr to ptr
|
|
store volatile i32 0, ptr %stof
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
|
|
; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
|
|
; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%stof = addrspacecast ptr addrspace(4) %ptr to ptr
|
|
%ld = load volatile i32, ptr %stof
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast
|
|
; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
|
|
; HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%ftos = addrspacecast ptr %ptr to ptr addrspace(1)
|
|
store volatile i32 0, ptr addrspace(1) %ftos
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast
|
|
; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
|
|
; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
|
|
; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%ftos = addrspacecast ptr %ptr to ptr addrspace(4)
|
|
%ld = load volatile i32, ptr addrspace(4) %ftos
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_is_shared
|
|
; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
|
|
; HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
|
|
; HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
|
|
; HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
|
|
%ext = zext i1 %is.shared to i32
|
|
store i32 %ext, ptr addrspace(1) poison
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_is_private
|
|
; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
|
|
; HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
|
|
; HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
|
|
; HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
|
|
%ext = zext i1 %is.private to i32
|
|
store i32 %ext, ptr addrspace(1) poison
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_alloca() #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_alloca
|
|
; HSA-SAME: () #[[ATTR1]] {
|
|
; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
|
|
; HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%alloca = alloca i32, addrspace(5)
|
|
store i32 0, ptr addrspace(5) %alloca
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
|
|
; HSA-SAME: () #[[ATTR1]] {
|
|
; HSA-NEXT: entry:
|
|
; HSA-NEXT: br label [[BB:%.*]]
|
|
; HSA: bb:
|
|
; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
|
|
; HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %bb
|
|
|
|
bb:
|
|
%alloca = alloca i32, addrspace(5)
|
|
store i32 0, ptr addrspace(5) %alloca
|
|
ret void
|
|
}
|
|
|
|
define void @use_alloca_func() #1 {
|
|
; HSA-LABEL: define {{[^@]+}}@use_alloca_func
|
|
; HSA-SAME: () #[[ATTR1]] {
|
|
; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
|
|
; HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
|
|
; HSA-NEXT: ret void
|
|
;
|
|
%alloca = alloca i32, addrspace(5)
|
|
store i32 0, ptr addrspace(5) %alloca
|
|
ret void
|
|
}
|
|
|
|
attributes #0 = { nounwind readnone speculatable }
|
|
attributes #1 = { nounwind }
|
|
|
|
|
|
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
|
|
;.
|
|
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
|
; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
;.
|