addAliasScopeMetadata in AMDGPULowerKernelArguments skips instructions
with empty PtrArgs, including memory-accessing calls that have no
pointer arguments (e.g. builtins like threadIdx()). Because these calls
never receive !noalias metadata, ScopedNoAliasAA cannot prove they don't
alias noalias kernel arguments. MemorySSA then conservatively reports
them as clobbers, which prevents AMDGPUAnnotateUniformValues from
marking loads as noclobber, blocking scalarization (s_load) and forcing
expensive vector loads (global_load) instead.
Fix by adding all noalias kernel argument scopes to !noalias metadata
for memory-accessing instructions with no pointer arguments. Since such
instructions cannot access memory through any kernel pointer argument,
all noalias scopes are safe to apply.
This fixes a performance regression in rocFFT introduced by bd9668df0f00
("[AMDGPU] Propagate alias information in AMDGPULowerKernelArguments").
Assisted-by: Claude Opus
134 lines
8.8 KiB
LLVM
134 lines
8.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
|
|
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-arguments %s | FileCheck %s
|
|
|
|
; Regression test for a bug where addAliasScopeMetadata skipped memory-
|
|
; accessing calls with no pointer arguments, leaving them without !noalias
|
|
; metadata. This caused AA to conservatively report them as potential
|
|
; clobbers of noalias kernel arguments, blocking downstream scalarization
|
|
; in AMDGPUAnnotateUniformValues and causing severe performance regressions
|
|
; (e.g. in rocFFT).
|
|
|
|
declare i32 @memory_read_no_ptr_args() #1
|
|
|
|
; The call reads memory but has no pointer arguments — it cannot alias
|
|
; any noalias kernel argument. The pass must add !noalias metadata to it.
|
|
; Kernel with two noalias addrspace(1) pointer arguments. It calls
; @memory_read_no_ptr_args (no operands; declared with attribute group #1,
; i.e. nounwind memory(read)) and uses the result to index %in.
; Expected output, per the assertions in this function together with the
; metadata assertions at the end of the file:
;   - the call carries a !noalias list naming the scopes of both "out" and "in";
;   - the load through %in has !alias.scope "in" and !noalias "out";
;   - the store through %out has !alias.scope "out" and !noalias "in".
define amdgpu_kernel void @call_without_ptr_args(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @call_without_ptr_args(
|
|
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR1:[0-9]+]] {
|
|
; CHECK-NEXT: [[CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
|
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT]], i64 0
|
|
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
|
|
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT]], i64 8
|
|
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
|
|
; CHECK-NEXT: [[VAL:%.*]] = call i32 @memory_read_no_ptr_args(), !noalias [[META1:![0-9]+]]
|
|
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[VAL]]
|
|
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META6:![0-9]+]]
|
|
; CHECK-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META6]], !noalias [[META5]]
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
; Input IR. The zero-operand call is the instruction this test is about.
%val = call i32 @memory_read_no_ptr_args()
|
|
; Index %in by the call result, then copy the loaded element to %out.
%gep = getelementptr i32, ptr addrspace(1) %in, i32 %val
|
|
%load = load i32, ptr addrspace(1) %gep, align 4
|
|
store i32 %load, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Same scenario, but the callee is readnone (memory(none)) — the call should NOT get !noalias metadata
|
|
; because it doesn't access memory at all and is skipped by the pass.
|
|
declare i32 @readnone_no_ptr_args() #2
|
|
|
|
; Kernel with a single noalias addrspace(1) pointer argument. It stores the
; result of @readnone_no_ptr_args (declared with attribute group #2, i.e.
; nounwind memory(none)) to %out.
; Expected output, per the assertions in this function together with the
; metadata assertions at the end of the file:
;   - the call is emitted with no metadata attached;
;   - the store through %out carries only !alias.scope, listing the "out" scope.
define amdgpu_kernel void @readnone_call_without_ptr_args(ptr addrspace(1) noalias %out) #0 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @readnone_call_without_ptr_args(
|
|
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]]) #[[ATTR1]] {
|
|
; CHECK-NEXT: [[READNONE_CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
|
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[READNONE_CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT]], i64 0
|
|
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
|
|
; CHECK-NEXT: [[VAL:%.*]] = call i32 @readnone_no_ptr_args()
|
|
; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META7:![0-9]+]]
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
; Input IR: zero-operand call whose result is written straight to %out.
%val = call i32 @readnone_no_ptr_args()
|
|
store i32 %val, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; argmemonly variant: memory(argmem: read) with no pointer arguments.
|
|
; This function can only access memory through its pointer arguments, but
|
|
; has none — so it effectively cannot access memory at all. The pass must
|
|
; still add !noalias metadata: doesNotAccessMemory() returns false for this call, so it is not skipped.
|
|
declare i32 @argmemonly_read_no_ptr_args() #3
|
|
|
|
; Kernel with two noalias addrspace(1) pointer arguments. It calls
; @argmemonly_read_no_ptr_args (no operands; declared with attribute group #3,
; i.e. nounwind memory(argmem: read)) and uses the result to index %in.
; Expected output, per the assertions in this function together with the
; metadata assertions at the end of the file:
;   - the call carries a !noalias list naming the scopes of both "out" and "in";
;   - the load through %in has !alias.scope "in" and !noalias "out";
;   - the store through %out has !alias.scope "out" and !noalias "in".
define amdgpu_kernel void @argmemonly_call_without_ptr_args(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @argmemonly_call_without_ptr_args(
|
|
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR1]] {
|
|
; CHECK-NEXT: [[ARGMEMONLY_CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
|
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARGMEMONLY_CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT]], i64 0
|
|
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
|
|
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARGMEMONLY_CALL_WITHOUT_PTR_ARGS_KERNARG_SEGMENT]], i64 8
|
|
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
|
|
; CHECK-NEXT: [[VAL:%.*]] = call i32 @argmemonly_read_no_ptr_args(), !noalias [[META10:![0-9]+]]
|
|
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[VAL]]
|
|
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4, !alias.scope [[META14:![0-9]+]], !noalias [[META15:![0-9]+]]
|
|
; CHECK-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META15]], !noalias [[META14]]
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
; Input IR. The zero-operand argmem-only call is the instruction under test.
%val = call i32 @argmemonly_read_no_ptr_args()
|
|
; Index %in by the call result, then copy the loaded element to %out.
%gep = getelementptr i32, ptr addrspace(1) %in, i32 %val
|
|
%load = load i32, ptr addrspace(1) %gep, align 4
|
|
store i32 %load, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; argmemonly with a pointer argument pointing to a noalias kernel arg —
|
|
; standard metadata path. The call accesses kernel arg memory through its
|
|
; pointer argument and gets both !alias.scope and !noalias as appropriate.
|
|
declare void @argmemonly_with_ptr_arg(ptr addrspace(1)) #4
|
|
|
|
; Kernel with two noalias addrspace(1) pointer arguments. It loads from %in,
; passes %out to @argmemonly_with_ptr_arg (declared with attribute group #4,
; i.e. nounwind memory(argmem: readwrite)), then stores the loaded value to %out.
; Expected output, per the assertions in this function together with the
; metadata assertions at the end of the file:
;   - the load through %in has !alias.scope "in" and !noalias "out";
;   - the call and the store through %out share the opposite pair:
;     !alias.scope "out" and !noalias "in".
define amdgpu_kernel void @argmemonly_call_with_ptr_arg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
|
|
; CHECK-LABEL: define amdgpu_kernel void @argmemonly_call_with_ptr_arg(
|
|
; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR1]] {
|
|
; CHECK-NEXT: [[ARGMEMONLY_CALL_WITH_PTR_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
|
|
; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARGMEMONLY_CALL_WITH_PTR_ARG_KERNARG_SEGMENT]], i64 0
|
|
; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
|
|
; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARGMEMONLY_CALL_WITH_PTR_ARG_KERNARG_SEGMENT]], i64 8
|
|
; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
|
|
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META19:![0-9]+]]
|
|
; CHECK-NEXT: call void @argmemonly_with_ptr_arg(ptr addrspace(1) [[OUT_LOAD]]), !alias.scope [[META19]], !noalias [[META16]]
|
|
; CHECK-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META19]], !noalias [[META16]]
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
; Input IR: read %in first, hand %out to the callee, then write %out.
%load = load i32, ptr addrspace(1) %in, align 4
|
|
call void @argmemonly_with_ptr_arg(ptr addrspace(1) %out)
|
|
store i32 %load, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Attribute groups referenced by the declarations and kernels in this file.
; #0: attached to every kernel definition in this file.
attributes #0 = { nounwind }
|
|
; #1: used by @memory_read_no_ptr_args (read-only, not limited to argument memory).
attributes #1 = { nounwind memory(read) }
|
|
; #2: used by @readnone_no_ptr_args (no memory access).
attributes #2 = { nounwind memory(none) }
|
|
; #3: used by @argmemonly_read_no_ptr_args (reads only through pointer arguments).
attributes #3 = { nounwind memory(argmem: read) }
|
|
; #4: used by @argmemonly_with_ptr_arg (reads/writes only through pointer arguments).
attributes #4 = { nounwind memory(argmem: readwrite) }
|
|
;.
|
|
; CHECK: [[META0]] = !{}
|
|
; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META4:![0-9]+]]}
|
|
; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"out"}
|
|
; CHECK: [[META3]] = distinct !{[[META3]], !"call_without_ptr_args"}
|
|
; CHECK: [[META4]] = distinct !{[[META4]], [[META3]], !"in"}
|
|
; CHECK: [[META5]] = !{[[META4]]}
|
|
; CHECK: [[META6]] = !{[[META2]]}
|
|
; CHECK: [[META7]] = !{[[META8:![0-9]+]]}
|
|
; CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]], !"out"}
|
|
; CHECK: [[META9]] = distinct !{[[META9]], !"readnone_call_without_ptr_args"}
|
|
; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META13:![0-9]+]]}
|
|
; CHECK: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]], !"out"}
|
|
; CHECK: [[META12]] = distinct !{[[META12]], !"argmemonly_call_without_ptr_args"}
|
|
; CHECK: [[META13]] = distinct !{[[META13]], [[META12]], !"in"}
|
|
; CHECK: [[META14]] = !{[[META13]]}
|
|
; CHECK: [[META15]] = !{[[META11]]}
|
|
; CHECK: [[META16]] = !{[[META17:![0-9]+]]}
|
|
; CHECK: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]], !"in"}
|
|
; CHECK: [[META18]] = distinct !{[[META18]], !"argmemonly_call_with_ptr_arg"}
|
|
; CHECK: [[META19]] = !{[[META20:![0-9]+]]}
|
|
; CHECK: [[META20]] = distinct !{[[META20]], [[META18]], !"out"}
|
|
;.
|