
Currently, we use `AAAMDWavesPerEU` to iteratively update values based on attributes from the associated function, potentially propagating user-annotated values, in combination with `AAAMDFlatWorkGroupSize`. `AAAMDFlatWorkGroupSize` plays the analogous role for the flat workgroup size. However, since the value calculated through the flat workgroup size always dominates the user annotation (i.e., the attribute), running `AAAMDWavesPerEU` iteratively is unnecessary if no user-annotated value exists. This PR completely rewrites how the `amdgpu-waves-per-eu` attribute is handled in `AMDGPUAttributor`. The key changes are as follows: - `AAAMDFlatWorkGroupSize` remains unchanged. - `AAAMDWavesPerEU` now only propagates user-annotated values. - A new function is added to check and update `amdgpu-waves-per-eu` based on the following rules: - No waves per eu, no flat workgroup size: Assume a flat workgroup size of `1,1024` and compute waves per eu based on this. - No waves per eu, flat workgroup size exists: Use the provided flat workgroup size to compute waves per eu. - Waves per eu exists, no flat workgroup size: This is a tricky case. In this PR, we assume a flat workgroup size of `1,1024`, but this can be adjusted if a different approach is preferred. Alternatively, we could directly use the user-annotated value. - Both waves per eu and flat workgroup size exist: If there’s a conflict, the value derived from the flat workgroup size takes precedence over waves per eu. This PR also updates the logic for merging two waves per eu pairs. The current implementation, which uses `clampStateAndIndicateChange` to compute a union, might not be ideal. From the perspective of ensuring proper resource allocation, for instance, if one pair specifies a minimum of 2 waves per eu and another specifies a minimum of 4, we should guarantee that 4 waves per eu can be supported, since failing to do so could result in excessive resource allocation per wave. A similar principle applies to the upper bound. 
Thus, the PR uses the following approach for merging two pairs, `lo_a,up_a` and `lo_b,up_b`: `max(lo_a, lo_b), max(up_a, up_b)`. This ensures that resource allocation adheres to the stricter constraints from both inputs. Fixes #123092.
423 lines
23 KiB
LLVM
423 lines
23 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
|
|
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
|
|
|
|
; Check propagation of amdgpu-waves-per-eu attribute.
|
|
|
|
; Called from kernels with 1,8 and 2,8 => 2,8
|
|
define internal void @default_to_1_8_a() {
|
|
; CHECK-LABEL: define internal void @default_to_1_8_a
|
|
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_1_8() #0 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_1_8
|
|
; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
|
|
; CHECK-NEXT: call void @default_to_1_8_a()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @default_to_1_8_a()
|
|
ret void
|
|
}
|
|
|
|
; Called from a single kernel with 1,2
|
|
define internal void @default_to_1_2() {
|
|
; CHECK-LABEL: define internal void @default_to_1_2
|
|
; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_1_2() #1 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2
|
|
; CHECK-SAME: () #[[ATTR2]] {
|
|
; CHECK-NEXT: call void @default_to_1_2()
|
|
; CHECK-NEXT: call void @flat_group_1_1()
|
|
; CHECK-NEXT: call void @default_to_1_8_b()
|
|
; CHECK-NEXT: call void @flat_group_2_8()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @default_to_1_2()
|
|
call void @flat_group_1_1()
|
|
call void @default_to_1_8_b()
|
|
call void @flat_group_2_8()
|
|
ret void
|
|
}
|
|
|
|
; Called from a single kernel with 1,4
|
|
define internal void @default_to_1_4() {
|
|
; CHECK-LABEL: define internal void @default_to_1_4
|
|
; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_1_4() #2 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_1_4
|
|
; CHECK-SAME: () #[[ATTR3]] {
|
|
; CHECK-NEXT: call void @default_to_1_4()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @default_to_1_4()
|
|
ret void
|
|
}
|
|
|
|
; Called from kernels with 2,9 and 9,9 => 9,9
|
|
define internal void @default_to_2_9() {
|
|
; CHECK-LABEL: define internal void @default_to_2_9
|
|
; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; This already has strict bounds, but called from kernels with wider
|
|
; bounds, and should not be changed.
|
|
define internal void @flat_group_1_1() #3 {
|
|
; CHECK-LABEL: define internal void @flat_group_1_1
|
|
; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; 2,8 -> 2,8
|
|
define internal void @flat_group_2_8() #4 {
|
|
; CHECK-LABEL: define internal void @flat_group_2_8
|
|
; CHECK-SAME: () #[[ATTR0]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; 9,10 -> 9,10
|
|
define internal void @flat_group_9_10() #5 {
|
|
; CHECK-LABEL: define internal void @flat_group_9_10
|
|
; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_2_9() #6 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_2_9
|
|
; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
|
|
; CHECK-NEXT: call void @default_to_2_9()
|
|
; CHECK-NEXT: call void @flat_group_1_1()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @default_to_2_9()
|
|
call void @flat_group_1_1()
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_9_9() #7 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_9_9
|
|
; CHECK-SAME: () #[[ATTR4]] {
|
|
; CHECK-NEXT: call void @default_to_2_9()
|
|
; CHECK-NEXT: call void @flat_group_9_10()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @default_to_2_9()
|
|
call void @flat_group_9_10()
|
|
ret void
|
|
}
|
|
|
|
; Called from kernels with 2,8 and 1,2 => 2,8
|
|
define internal void @default_to_1_8_b() {
|
|
; CHECK-LABEL: define internal void @default_to_1_8_b
|
|
; CHECK-SAME: () #[[ATTR0]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; The kernel's lower bound is higher than the callee's lower bound, so
|
|
; this should probably be illegal.
|
|
define amdgpu_kernel void @kernel_2_8() #4 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_2_8
|
|
; CHECK-SAME: () #[[ATTR0]] {
|
|
; CHECK-NEXT: call void @default_to_1_8_a()
|
|
; CHECK-NEXT: call void @default_to_1_8_b()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @default_to_1_8_a()
|
|
call void @default_to_1_8_b()
|
|
ret void
|
|
}
|
|
|
|
; 1,2 -> 1,2
|
|
define internal void @merge_cycle_0() #1 {
|
|
; CHECK-LABEL: define internal void @merge_cycle_0
|
|
; CHECK-SAME: () #[[ATTR2]] {
|
|
; CHECK-NEXT: call void @merge_cycle_1()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @merge_cycle_1()
|
|
ret void
|
|
}
|
|
|
|
; Called from 1,2 + 3,8
|
|
; 2,8 -> 2,8
|
|
define internal void @merge_cycle_1() #4 {
|
|
; CHECK-LABEL: define internal void @merge_cycle_1
|
|
; CHECK-SAME: () #[[ATTR0]] {
|
|
; CHECK-NEXT: call void @merge_cycle_0()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @merge_cycle_0()
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_3_8() #8 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_3_8
|
|
; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
|
|
; CHECK-NEXT: call void @merge_cycle_0()
|
|
; CHECK-NEXT: call void @default_captured_address()
|
|
; CHECK-NEXT: call void @externally_visible_default()
|
|
; CHECK-NEXT: [[F32:%.*]] = call float @bitcasted_function()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @merge_cycle_0()
|
|
call void @default_captured_address()
|
|
call void @externally_visible_default()
|
|
%f32 = call float @bitcasted_function()
|
|
ret void
|
|
}
|
|
|
|
; Has no attributes of its own; called from kernel_3_8 and also stores its own
; address. Expected result: amdgpu-waves-per-eu 4,10 and no
; amdgpu-flat-work-group-size.
define internal void @default_captured_address() {
|
|
; CHECK-LABEL: define internal void @default_captured_address
|
|
; CHECK-SAME: () #[[ATTR9:[0-9]+]] {
|
|
; CHECK-NEXT: store volatile ptr @default_captured_address, ptr poison, align 8
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
store volatile ptr @default_captured_address, ptr poison, align 8
|
|
ret void
|
|
}
|
|
|
|
; Externally visible and without attributes of its own; called from
; kernel_3_8. Expected result: amdgpu-waves-per-eu 4,10 and no
; amdgpu-flat-work-group-size.
define void @externally_visible_default() {
|
|
; CHECK-LABEL: define void @externally_visible_default
|
|
; CHECK-SAME: () #[[ATTR9]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; 1,10 -> 3,8
|
|
define internal i32 @bitcasted_function() {
|
|
; CHECK-LABEL: define internal i32 @bitcasted_function
|
|
; CHECK-SAME: () #[[ATTR8]] {
|
|
; CHECK-NEXT: ret i32 0
|
|
;
|
|
ret i32 0
|
|
}
|
|
|
|
; Called from kernel_invalid_bounds_0_8, whose amdgpu-waves-per-eu range 0,8 is
; invalid. Expected result: amdgpu-waves-per-eu 1,8.
define internal void @called_from_invalid_bounds_0() {
|
|
; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
|
|
; CHECK-SAME: () #[[ATTR1]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; Called from kernel_invalid_bounds_1_123, whose amdgpu-waves-per-eu range
; 1,123 is invalid. Expected result: no amdgpu-waves-per-eu attribute.
define internal void @called_from_invalid_bounds_1() {
|
|
; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
|
|
; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; Invalid range for amdgpu-waves-per-eu
|
|
define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
|
|
; CHECK-SAME: () #[[ATTR1]] {
|
|
; CHECK-NEXT: call void @called_from_invalid_bounds_0()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @called_from_invalid_bounds_0()
|
|
ret void
|
|
}
|
|
|
|
; Invalid range for amdgpu-waves-per-eu
|
|
define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
|
|
; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
|
|
; CHECK-NEXT: call void @called_from_invalid_bounds_1()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @called_from_invalid_bounds_1()
|
|
ret void
|
|
}
|
|
|
|
; XXX - Why is the maximum not 6?
|
|
; The 512 maximum workgroup size implies a minimum occupancy of 2. The
|
|
; implied minimum waves-per-eu should not be 3
|
|
; -> 2,10
|
|
define void @larger_group_size_implies_lower_minimum() #11 {
|
|
; CHECK-LABEL: define void @larger_group_size_implies_lower_minimum
|
|
; CHECK-SAME: () #[[ATTR12:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_3_6() #12 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_3_6
|
|
; CHECK-SAME: () #[[ATTR13:[0-9]+]] {
|
|
; CHECK-NEXT: call void @larger_group_size_implies_lower_minimum()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @larger_group_size_implies_lower_minimum()
|
|
ret void
|
|
}
|
|
|
|
; 3,6 -> 4,10
|
|
define internal void @refine_upper_func_3_6() #13 {
|
|
; CHECK-LABEL: define internal void @refine_upper_func_3_6
|
|
; CHECK-SAME: () #[[ATTR9]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; 4,8 -> 4,8
|
|
define internal void @refine_lower_func_4_8() #14 {
|
|
; CHECK-LABEL: define internal void @refine_lower_func_4_8
|
|
; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
|
|
; CHECK-NEXT: call void @refine_upper_func_3_6()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @refine_upper_func_3_6()
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_foo_6_8() #15 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8
|
|
; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
|
|
; CHECK-NEXT: call void @refine_upper_func_3_6()
|
|
; CHECK-NEXT: call void @refine_lower_func_4_8()
|
|
; CHECK-NEXT: call void @func_9_10_a()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @refine_upper_func_3_6()
|
|
call void @refine_lower_func_4_8()
|
|
call void @func_9_10_a()
|
|
ret void
|
|
}
|
|
|
|
; 5,5 -> 5,5
|
|
define internal void @func_5_5() #16 {
|
|
; CHECK-LABEL: define internal void @func_5_5
|
|
; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; 5,8 -> 5,8
|
|
define internal void @func_5_8() #17 {
|
|
; CHECK-LABEL: define internal void @func_5_8
|
|
; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; 9,10 -> 9,10
|
|
define internal void @func_9_10_a() #18 {
|
|
; CHECK-LABEL: define internal void @func_9_10_a
|
|
; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
; 9,10 -> 9,10
|
|
define internal void @func_9_10_b() #18 {
|
|
; CHECK-LABEL: define internal void @func_9_10_b
|
|
; CHECK-SAME: () #[[ATTR18]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @kernel_bar_8_9() #19 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9
|
|
; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
|
|
; CHECK-NEXT: call void @refine_upper_func_3_6()
|
|
; CHECK-NEXT: call void @func_5_5()
|
|
; CHECK-NEXT: call void @func_9_10_b()
|
|
; CHECK-NEXT: call void @func_5_8()
|
|
; CHECK-NEXT: call void @externally_visible()
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
call void @refine_upper_func_3_6()
|
|
call void @func_5_5()
|
|
call void @func_9_10_b()
|
|
call void @func_5_8()
|
|
call void @externally_visible()
|
|
ret void
|
|
}
|
|
|
|
; This is an optimization hint based on users, so it's not strictly
|
|
; required that all callers be visible.
|
|
define void @externally_visible() {
|
|
; CHECK-LABEL: define void @externally_visible
|
|
; CHECK-SAME: () #[[ATTR9]] {
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
ret void
|
|
}
|
|
|
|
|
|
; Use a 1 wave workgroup so there is no interaction by the workgroup
|
|
; size on the implied waves per EU.
|
|
|
|
attributes #0 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,8" }
|
|
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,2" }
|
|
attributes #2 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,4" }
|
|
attributes #3 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,1" }
|
|
attributes #4 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="2,8" }
|
|
attributes #5 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="9,10" }
|
|
attributes #6 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="2,9" }
|
|
attributes #7 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="9,9" }
|
|
attributes #8 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="3,8" }
|
|
attributes #9 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="0,8" }
|
|
attributes #10 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,123" }
|
|
attributes #11 = { "amdgpu-flat-work-group-size"="1,512" }
|
|
attributes #12 = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-waves-per-eu"="3,6" }
|
|
attributes #13 = { "amdgpu-waves-per-eu"="3,6" }
|
|
attributes #14 = { "amdgpu-waves-per-eu"="4,8" }
|
|
attributes #15 = { "amdgpu-waves-per-eu"="6,8" }
|
|
attributes #16 = { "amdgpu-waves-per-eu"="5,5" }
|
|
attributes #17 = { "amdgpu-waves-per-eu"="5,8" }
|
|
attributes #18 = { "amdgpu-waves-per-eu"="9,10" }
|
|
attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
|
|
;.
|
|
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
|
|
; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
|
|
;.
|