llvm-project/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
Jay Foad b3995aa338
[AMDGPU] Decrease default NSA threshold from 3 to 2 (#116624)
In graphics shaders it is better overall to use NSA encoding for IMAGE
instructions, because the benefit of less constrained register
allocation outweighs the cost of larger encoding. In particular NSA form
often avoids the need for extra V_MOV_B32 instructions between IMAGE
instructions, which can allow the IMAGE instructions to be claused.

Note that in GFX12 there is no longer a bit in the encoding to choose
between NSA and non-NSA forms, so this only affects GFX10 and GFX11.
2024-11-19 15:54:27 +00:00

31 lines
2.2 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -stop-after=finalize-isel < %s | FileCheck %s -check-prefix=GFX10
; @test issues two 2D image loads from the same all-zero coordinates and the
; same zeroinitializer resource operand. The first requests two components
; (leading immediate 3), the second three components (leading immediate 7).
; Only element 1 of the first result and elements 0 and 1 of the second are
; ever extracted, so the third component of the wider load is dead.
;
; The assertions below expect a single IMAGE_LOAD_V2_V2_nsa_gfx10 with
; immediate 3 after finalize-isel: its sub1 feeds both additions and its sub0
; feeds the first one.
; NOTE(review): judging by the file name, the intent is that the wider load is
; narrowed by the writemask adjustment and then CSE'd with the narrower one --
; confirm against the AMDGPU backend before relying on this description.
define float @test() {
; GFX10-LABEL: name: test
; GFX10: bb.0.bb:
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]
; GFX10-NEXT: SI_RETURN implicit $vgpr0
bb:
; Two loads that differ only in the leading immediate (3 vs 7) and result width.
%v0 = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 3, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
%v1 = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 7, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
; Element 2 of %v1 is never extracted; only indices 0 and 1 are used overall.
%e0 = extractelement <2 x float> %v0, i64 1
%e1 = extractelement <3 x float> %v1, i64 0
%e2 = extractelement <3 x float> %v1, i64 1
; Sum the extracted values so that results of both loads stay live.
%a0 = fadd float %e0, %e1
%a1 = fadd float %a0, %e2
ret float %a1
}
; Declarations of the two image-load intrinsic overloads called by @test
; (three-component and two-component float results). Parameters marked
; `immarg` must be passed immediate constants at every call site.
declare <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
declare <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)