
In graphics shaders it is better overall to use the NSA encoding for IMAGE instructions, because the benefit of less constrained register allocation outweighs the cost of the larger encoding. In particular, the NSA form often avoids the need for extra V_MOV_B32 instructions between IMAGE instructions, which in turn can allow the IMAGE instructions to be claused. Note that on GFX12 there is no longer a bit in the encoding to choose between NSA and non-NSA forms, so this choice only affects GFX10 and GFX11.
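As a rough illustration (a hand-written sketch in GFX10 assembly syntax, not output from this test), the non-NSA form requires the 2D coordinates to occupy consecutive VGPRs, which often forces V_MOV_B32 copies to marshal them into place, whereas the NSA form encodes each address VGPR individually:

  ; non-NSA: the x/y coordinates must live in the consecutive pair v[4:5]
  image_load v[0:1], v[4:5], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D
  ; NSA: the coordinates can stay wherever register allocation left them
  image_load v[0:1], [v4, v7], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D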
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -stop-after=finalize-isel < %s | FileCheck %s -check-prefix=GFX10
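
; Check that the image load selects to the NSA form (IMAGE_LOAD_V2_V2_nsa_gfx10) on GFX10.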
define float @test() {
; GFX10-LABEL: name: test
; GFX10: bb.0.bb:
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]
; GFX10-NEXT: SI_RETURN implicit $vgpr0
bb:
%v0 = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 3, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
%v1 = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 7, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
%e0 = extractelement <2 x float> %v0, i64 1
%e1 = extractelement <3 x float> %v1, i64 0
%e2 = extractelement <3 x float> %v1, i64 1
%a0 = fadd float %e0, %e1
%a1 = fadd float %a0, %e2
ret float %a1
}

declare <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
declare <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)