llvm-project/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
Fabian Ritter 332f060363
[SeparateConstOffsetFromGEP] Don't set unsound inbounds flag (#130616)
The language reference says about inbounds geps that "if the
getelementptr has any non-zero indices[...] [t]he base pointer has an in
bounds address of the allocated object that it is based on [and]
[d]uring the successive addition of offsets to the address, the
resulting pointer must remain in bounds of the allocated object at each
step."

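If (gep inbounds p, (a + 5)) is translated to (gep [inbounds] (gep p,
a), 5) with p pointing to the beginning of an object and a=-4, as the
example in the comments suggests, then neither of the resulting geps
satisfies these requirements: the inner gep produces an address before
the start of the object, and the outer gep uses that out-of-bounds
address as its base pointer. Therefore, we need to clear the inbounds
flag for both geps.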

We might want to use ValueTracking to check if a is known to be
non-negative to preserve the inbounds flags.

For the AMDGPU tests with scratch instructions, removing the unsound
inbounds flag means that AMDGPUDAGToDAGISel::isFlatScratchBaseLegal sees
no NUW flag at the pointer add, which prevents generation of scratch
instructions with immediate offsets.

For SWDEV-516125.
2025-03-18 12:30:20 +01:00

132 lines
6.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; Pipeline under test: opt runs separate-const-offset-from-gep, then slsr, then
; gvn for an amdgcn triple, and FileCheck matches the printed IR against the
; assertions embedded in each function of this file.
; RUN: opt -S -mtriple=amdgcn-- -passes=separate-const-offset-from-gep,slsr,gvn < %s | FileCheck %s
; The data layout below makes address space 1 pointers 64 bits wide (p1:64:64)
; and address space 3 pointers 32 bits wide (p3:32:32), matching the i64 gep
; indices used by the addrspace(1) tests and the i32 gep indices used by the
; addrspace(3) tests.
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
; Two loads from addrspace(1): arr[i + 1023] and arr[2*i + 1023], indexed as
; float, so the constant part of each address is 1023 * 4 = 4092 bytes.
;
; Expected result: the constant is peeled off into a trailing
; "getelementptr i8 ..., i64 4092", and the second address is built from the
; variable part of the first one (&arr[i]) plus sext(i) << 2 bytes instead of
; being recomputed from arr. None of the rewritten geps carries the inbounds
; flag, and the i32 adds/shift of the input no longer appear.
;
; NOTE(review): the function name suggests 4092 is the largest constant offset
; that is still split out for MUBUF addressing; that limit is not visible in
; this file, so confirm against the target's addressing-mode hooks.
define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(ptr addrspace(1) %out, ptr addrspace(1) noalias %arr, i32 %i) {
; CHECK-LABEL: define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) noalias [[ARR:%.*]], i32 [[I:%.*]]) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr addrspace(1) [[ARR]], i64 [[TMP0]]
; CHECK-NEXT: [[P12:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP1]], i64 4092
; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(1) [[P12]], align 4
; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
; CHECK-NEXT: [[P24:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP3]], i64 4092
; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(1) [[P24]], align 4
; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
bb:
; %i2 = 2 * i
%i2 = shl nsw i32 %i, 1
; First access: arr[i + 1023], index sign-extended to i64 before the gep.
%j1 = add nsw i32 %i, 1023
%tmp = sext i32 %j1 to i64
%p1 = getelementptr inbounds float, ptr addrspace(1) %arr, i64 %tmp
%v11 = load i32, ptr addrspace(1) %p1, align 4
store i32 %v11, ptr addrspace(1) %out, align 4
; Second access: arr[2*i + 1023], same constant with a doubled variable index.
%j2 = add nsw i32 %i2, 1023
%tmp5 = sext i32 %j2 to i64
%p2 = getelementptr inbounds float, ptr addrspace(1) %arr, i64 %tmp5
%v22 = load i32, ptr addrspace(1) %p2, align 4
store i32 %v22, ptr addrspace(1) %out, align 4
ret void
}
; Same shape as the 1023 variant, but the constant index is 1024, i.e. a
; constant byte offset of 1024 * 4 = 4096 for the float-indexed geps.
;
; Expected result: the constant is NOT split out. Both geps keep their
; original single-index form and their inbounds flag, and the first index is
; still "add nsw i32 i, 1024". The only rewrite is that the second index is
; computed as j1 + i (a plain add without nsw) rather than from the shifted
; value, so the shl of the input no longer appears.
;
; NOTE(review): the function name suggests 4096 is just past the offset range
; for which splitting is done for MUBUF addressing; that limit is not visible
; in this file.
define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(ptr addrspace(1) %out, ptr addrspace(1) noalias %arr, i32 %i) {
; CHECK-LABEL: define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) noalias [[ARR:%.*]], i32 [[I:%.*]]) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[J1:%.*]] = add nsw i32 [[I]], 1024
; CHECK-NEXT: [[TMP:%.*]] = sext i32 [[J1]] to i64
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP]]
; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(1) [[P1]], align 4
; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]]
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[J2]] to i64
; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP5]]
; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(1) [[P2]], align 4
; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
bb:
; %i2 = 2 * i
%i2 = shl nsw i32 %i, 1
; First access: arr[i + 1024].
%j1 = add nsw i32 %i, 1024
%tmp = sext i32 %j1 to i64
%p1 = getelementptr inbounds float, ptr addrspace(1) %arr, i64 %tmp
%v11 = load i32, ptr addrspace(1) %p1, align 4
store i32 %v11, ptr addrspace(1) %out, align 4
; Second access: arr[2*i + 1024].
%j2 = add nsw i32 %i2, 1024
%tmp5 = sext i32 %j2 to i64
%p2 = getelementptr inbounds float, ptr addrspace(1) %arr, i64 %tmp5
%v22 = load i32, ptr addrspace(1) %p2, align 4
store i32 %v22, ptr addrspace(1) %out, align 4
ret void
}
; Two loads from addrspace(3): arr[i + 16383] and arr[2*i + 16383], indexed as
; float with i32 indices (no sign extension involved), so the constant part of
; each address is 16383 * 4 = 65532 bytes. The loaded values are stored to the
; addrspace(1) pointer %out.
;
; Expected result: the constant is peeled off into a trailing
; "getelementptr i8 ..., i32 65532", and the second address is built from the
; variable part of the first one (&arr[i]) plus i << 2 bytes. None of the
; rewritten geps carries the inbounds flag.
;
; NOTE(review): the function name suggests 65532 is the largest constant offset
; that is still split out for DS addressing; that limit is not visible in this
; file.
define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(ptr addrspace(1) %out, ptr addrspace(3) noalias %arr, i32 %i) {
; CHECK-LABEL: define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) noalias [[ARR:%.*]], i32 [[I:%.*]]) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr addrspace(3) [[ARR]], i32 [[I]]
; CHECK-NEXT: [[P12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 65532
; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(3) [[P12]], align 4
; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[I]], 2
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 [[TMP1]]
; CHECK-NEXT: [[P24:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 65532
; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(3) [[P24]], align 4
; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
bb:
; %i2 = 2 * i
%i2 = shl nsw i32 %i, 1
; First access: arr[i + 16383].
%j1 = add nsw i32 %i, 16383
%p1 = getelementptr inbounds float, ptr addrspace(3) %arr, i32 %j1
%v11 = load i32, ptr addrspace(3) %p1, align 4
store i32 %v11, ptr addrspace(1) %out, align 4
; Second access: arr[2*i + 16383].
%j2 = add nsw i32 %i2, 16383
%p2 = getelementptr inbounds float, ptr addrspace(3) %arr, i32 %j2
%v22 = load i32, ptr addrspace(3) %p2, align 4
store i32 %v22, ptr addrspace(1) %out, align 4
ret void
}
; Same shape as the 16383 variant, but the constant index is 16384, i.e. a
; constant byte offset of 16384 * 4 = 65536 for the float-indexed geps.
;
; Expected result: the constant is NOT split out. Both geps keep their
; original single-index form and their inbounds flag, and the first index is
; still "add nsw i32 i, 16384". The only rewrite is that the second index is
; computed as j1 + i (a plain add without nsw) rather than from the shifted
; value, so the shl of the input no longer appears.
;
; NOTE(review): the function name suggests 65536 is just past the offset range
; for which splitting is done for DS addressing; that limit is not visible in
; this file.
define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(ptr addrspace(1) %out, ptr addrspace(3) noalias %arr, i32 %i) {
; CHECK-LABEL: define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) noalias [[ARR:%.*]], i32 [[I:%.*]]) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[J1:%.*]] = add nsw i32 [[I]], 16384
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J1]]
; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(3) [[P1]], align 4
; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]]
; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J2]]
; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(3) [[P2]], align 4
; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
bb:
; %i2 = 2 * i
%i2 = shl nsw i32 %i, 1
; First access: arr[i + 16384].
%j1 = add nsw i32 %i, 16384
%p1 = getelementptr inbounds float, ptr addrspace(3) %arr, i32 %j1
%v11 = load i32, ptr addrspace(3) %p1, align 4
store i32 %v11, ptr addrspace(1) %out, align 4
; Second access: arr[2*i + 16384].
%j2 = add nsw i32 %i2, 16384
%p2 = getelementptr inbounds float, ptr addrspace(3) %arr, i32 %j2
%v22 = load i32, ptr addrspace(3) %p2, align 4
store i32 %v22, ptr addrspace(1) %out, align 4
ret void
}