For flat memory instructions where the address is supplied as a base address register with an immediate offset, the memory aperture test ignores the immediate offset. Currently, SDISel does not respect that, which leads to miscompilations where valid input programs crash when the address computation relies on the immediate offset to get the base address in the proper memory aperture. Global or scratch instructions are not affected. This patch only selects flat instructions with immediate offsets from PTRADD address computations with the inbounds flag: If the PTRADD does not leave the bounds of the allocated object, it cannot leave the bounds of the memory aperture and is therefore safe to handle with an immediate offset. Affected tests: - CodeGen/AMDGPU/fold-gep-offset.ll: Offsets are no longer wrongly folded, added new positive tests where we still do fold them. - CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll: Offset folding doesn't seem integral to this test, so the test is not changed to make offset folding still happen. - CodeGen/AMDGPU/loop-prefetch-data.ll: loop-reduce transforms inbounds addresses for accesses to be based on potentially OOB addresses used for prefetching. - I think the remaining ones suffer from the limited preservation of the inbounds flag in PTRADD DAGCombines due to the provenance problems pointed out in PR #165424 and the fact that `AMDGPUTargetLowering::SplitVector{Load|Store}` legalizes too-wide accesses by repeatedly splitting them in half. Legalizing a V32S32 memory access therefore leads to inbounds ptradd chains like (ptradd inbounds (ptradd inbounds (ptradd inbounds P, 64), 32), 16). The DAGCombines fold them into a single ptradd, but the involved transformations generally cannot preserve the inbounds flag (even though it would be valid in this case). Similar previous PR that relied on `ISD::ADD inbounds` instead of `ISD::PTRADD inbounds` (closed): #132353 Analogous PR for GISel (merged): #153001 Fixes SWDEV-516125.
192 lines
7.8 KiB
LLVM
192 lines
7.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
|
|
|
|
; Test that negative 64-bit values shifted by [32-63] bits have
|
|
; a hi-result created by moving an all-ones constant.
|
|
|
|
; FIXME: Range metadata is invalidated when i64 types are legalized to v2i32 types.
|
|
; We could call performSraCombine before legalization, but other optimizations only work
|
|
; with 64-bit sra.
|
|
; Scalar i64 case: %val is loaded with !range !0 and %shift.amt with !range !1,
; then %val is arithmetically shifted right by %shift.amt and returned.
; The expected output contains only 32-bit shifts (no v_ashrrev_i64): v0 is v5
; shifted by the loaded amount in v4, and v1 is v5 shifted by the constant 31.
; Note that v1 is NOT produced by a v_mov_b32 of -1 here.
; NOTE(review): presumably this is the missed case the FIXME above refers to --
; confirm before relying on that reading.
define i64 @scalar_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: scalar_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dword v4, v[2:3]
|
|
; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1
|
|
; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v5
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v5
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load i64, ptr %arg0.ptr, !range !0, !noundef !{}
|
|
%shift.amt = load i64, ptr %arg1.ptr, !range !1, !noundef !{}
|
|
%ashr = ashr i64 %val, %shift.amt
|
|
ret i64 %ashr
|
|
}
|
|
|
|
; <2 x i64> case: %val is loaded with !range !2 and %shift.amt with !range !3.
; The expected output writes -1 into v1 and v3 with v_mov_b32 and computes v0
; and v2 with 32-bit v_ashrrev_i32 shifts; no v_ashrrev_i64 is expected.
define <2 x i64> @v2_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: v2_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, -1
|
|
; CHECK-NEXT: v_mov_b32_e32 v3, -1
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load <2 x i64>, ptr %arg0.ptr, !range !2, !noundef !{}
|
|
%shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !3, !noundef !{}
|
|
%ashr = ashr <2 x i64> %val, %shift.amt
|
|
ret <2 x i64> %ashr
|
|
}
|
|
|
|
; <3 x i64> case: %val is loaded with !range !4 and %shift.amt with !range !5.
; The expected output writes -1 into v1, v3 and v5 with v_mov_b32 and computes
; v0, v2 and v4 with 32-bit v_ashrrev_i32 shifts; no v_ashrrev_i64 is expected.
; The checks also pin how the trailing element is addressed: one dword is
; loaded through an explicit 64-bit add of 20 to v[0:1] (v_add_co_u32 /
; v_addc_co_u32) followed by a flat_load_dword without an immediate offset,
; while the other is loaded from v[2:3] with offset:16.
define <3 x i64> @v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: v3_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
|
|
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dword v4, v[2:3] offset:16
|
|
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
|
|
; CHECK-NEXT: v_mov_b32_e32 v3, -1
|
|
; CHECK-NEXT: flat_load_dword v1, v[0:1]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v4, v4, v1
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, -1
|
|
; CHECK-NEXT: v_mov_b32_e32 v5, -1
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load <3 x i64>, ptr %arg0.ptr, !range !4, !noundef !{}
|
|
%shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !5, !noundef !{}
|
|
%ashr = ashr <3 x i64> %val, %shift.amt
|
|
ret <3 x i64> %ashr
|
|
}
|
|
|
|
; <4 x i64> case: %val is loaded with !range !6 and %shift.amt with !range !7.
; The expected output writes -1 into v1, v3, v5 and v7 with v_mov_b32 and
; computes v0, v2, v4 and v6 with 32-bit v_ashrrev_i32 shifts; no
; v_ashrrev_i64 is expected. Each operand is fetched with two
; flat_load_dwordx4 instructions, the second one using offset:16.
define <4 x i64> @v4_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: v4_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
|
|
; CHECK-NEXT: flat_load_dwordx4 v[11:14], v[0:1] offset:16
|
|
; CHECK-NEXT: flat_load_dwordx4 v[15:18], v[2:3] offset:16
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, -1
|
|
; CHECK-NEXT: v_mov_b32_e32 v3, -1
|
|
; CHECK-NEXT: v_mov_b32_e32 v5, -1
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v7, -1
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v8
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v10
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v4, v15, v12
|
|
; CHECK-NEXT: v_ashrrev_i32_e32 v6, v17, v14
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load <4 x i64>, ptr %arg0.ptr, !range !6, !noundef !{}
|
|
%shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !7, !noundef !{}
|
|
%ashr = ashr <4 x i64> %val, %shift.amt
|
|
ret <4 x i64> %ashr
|
|
}
|
|
|
|
; Ranges used when transformation is valid
; Even-numbered nodes (!0, !2, !4, !6) are attached to the shifted values and
; odd-numbered nodes (!1, !3, !5, !7) to the shift amounts of the four
; *_ashr_metadata functions above. Per the LLVM LangRef, a !range pair is the
; half-open interval [lo, hi), so every value range here is strictly negative
; and every shift-amount range starts at 32 or higher and ends at 63 or lower.
|
|
!0 = !{i64 -6000000000, i64 0}
|
|
!1 = !{i64 32, i64 64}
|
|
!2 = !{i64 -7000000000, i64 -1000}
|
|
!3 = !{i64 38, i64 64}
|
|
!4 = !{i64 -8000000000, i64 -2001}
|
|
!5 = !{i64 38, i64 60}
|
|
!6 = !{i64 -9000000000, i64 -3002}
|
|
!7 = !{i64 38, i64 50}
|
|
|
|
; Test that negative 64-bit values whose shift-amount range starts below 32 (lower bounds 26-29 here) do NOT have
|
|
; a hi-result created by moving an all-ones constant.
|
|
|
|
; Negative scalar i64 case: %val is loaded with !range !8 and %shift.amt with
; !range !9, whose lower bound (29) is below 32. The expected output keeps a
; single full-width v_ashrrev_i64 and contains no v_mov_b32 of -1.
define i64 @no_transform_scalar_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: no_transform_scalar_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; CHECK-NEXT: flat_load_dword v6, v[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_ashrrev_i64 v[0:1], v6, v[4:5]
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load i64, ptr %arg0.ptr, !range !8, !noundef !{}
|
|
%shift.amt = load i64, ptr %arg1.ptr, !range !9, !noundef !{}
|
|
%ashr = ashr i64 %val, %shift.amt
|
|
ret i64 %ashr
|
|
}
|
|
|
|
; Negative <2 x i64> case: %val is loaded with !range !10 and %shift.amt with
; !range !11, whose lower bound (28) is below 32. The expected output keeps
; one full-width v_ashrrev_i64 per element and contains no v_mov_b32 of -1.
define <2 x i64> @no_transform_v2_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: no_transform_v2_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_ashrrev_i64 v[0:1], v8, v[4:5]
|
|
; CHECK-NEXT: v_ashrrev_i64 v[2:3], v10, v[6:7]
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load <2 x i64>, ptr %arg0.ptr, !range !10, !noundef !{}
|
|
%shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !11, !noundef !{}
|
|
%ashr = ashr <2 x i64> %val, %shift.amt
|
|
ret <2 x i64> %ashr
|
|
}
|
|
|
|
; Negative <3 x i64> case: %val is loaded with !range !12 and %shift.amt with
; !range !13, whose lower bound (27) is below 32. The expected output keeps
; one full-width v_ashrrev_i64 per element and contains no v_mov_b32 of -1.
; Both trailing-element loads in the checks use offset:16.
define <3 x i64> @no_transform_v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: no_transform_v3_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
|
|
; CHECK-NEXT: flat_load_dwordx2 v[11:12], v[0:1] offset:16
|
|
; CHECK-NEXT: flat_load_dword v5, v[2:3] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_ashrrev_i64 v[0:1], v4, v[7:8]
|
|
; CHECK-NEXT: v_ashrrev_i64 v[2:3], v6, v[9:10]
|
|
; CHECK-NEXT: v_ashrrev_i64 v[4:5], v5, v[11:12]
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load <3 x i64>, ptr %arg0.ptr, !range !12, !noundef !{}
|
|
%shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !13, !noundef !{}
|
|
%ashr = ashr <3 x i64> %val, %shift.amt
|
|
ret <3 x i64> %ashr
|
|
}
|
|
|
|
; Negative <4 x i64> case: %val is loaded with !range !14 and %shift.amt with
; !range !15, whose lower bound (26) is below 32. The expected output keeps
; one full-width v_ashrrev_i64 per element and contains no v_mov_b32 of -1.
define <4 x i64> @no_transform_v4_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
|
|
; CHECK-LABEL: no_transform_v4_ashr_metadata:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
|
|
; CHECK-NEXT: flat_load_dwordx4 v[11:14], v[0:1] offset:16
|
|
; CHECK-NEXT: flat_load_dwordx4 v[15:18], v[2:3] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_ashrrev_i64 v[0:1], v4, v[7:8]
|
|
; CHECK-NEXT: v_ashrrev_i64 v[2:3], v6, v[9:10]
|
|
; CHECK-NEXT: v_ashrrev_i64 v[4:5], v15, v[11:12]
|
|
; CHECK-NEXT: v_ashrrev_i64 v[6:7], v17, v[13:14]
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%val = load <4 x i64>, ptr %arg0.ptr, !range !14, !noundef !{}
|
|
%shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !15, !noundef !{}
|
|
%ashr = ashr <4 x i64> %val, %shift.amt
|
|
ret <4 x i64> %ashr
|
|
}
|
|
|
|
; Ranges used when transformation is invalid
; Even-numbered nodes (!8, !10, !12, !14) are attached to the shifted values
; and odd-numbered nodes (!9, !11, !13, !15) to the shift amounts of the four
; no_transform_* functions above. Per the LLVM LangRef, a !range pair is the
; half-open interval [lo, hi). The value ranges are still strictly negative;
; what differs from the valid set is that every shift-amount range starts
; below 32 (at 29, 28, 27 and 26 respectively).
|
|
!8 = !{i64 -10000000000, i64 0}
|
|
!9 = !{i64 29, i64 64}
|
|
!10 = !{i64 -11000000000, i64 -1000}
|
|
!11 = !{i64 28, i64 64}
|
|
!12 = !{i64 -12000000000, i64 -2001}
|
|
!13 = !{i64 27, i64 60}
|
|
!14 = !{i64 -13000000000, i64 -3002}
|
|
!15 = !{i64 26, i64 50}
|
|
|