llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
Shilei Tian fc0653f31c
[RFC][NFC][AMDGPU] Remove -verify-machineinstrs from llvm/test/CodeGen/AMDGPU/*.ll (#150024)
Recent upstream trends have moved away from explicitly using `-verify-machineinstrs`, as it's already covered by the expensive checks. This PR removes almost all `-verify-machineinstrs` from tests in `llvm/test/CodeGen/AMDGPU/*.ll`, leaving only those tests where its removal currently causes failures.
2025-07-23 13:42:46 -04:00

237 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1150,GFX1150-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1150,GFX1150-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps <3 x float> @gather_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, <4 x i32> inreg %samp2, float %s, float %t) {
; GFX11-LABEL: gather_sample:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1150-LABEL: gather_sample:
; GFX1150: ; %bb.0:
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: gather_sample:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: image_gather4_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
%w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc2, <4 x i32> %samp2, i1 false, i32 0, i32 0)
%t0 = extractelement <4 x float> %v, i32 0
%res0 = insertelement <3 x float> poison, float %t0, i32 0
%t1 = extractelement <4 x float> %v, i32 1
%res1 = insertelement <3 x float> %res0, float %t1, i32 1
%t2 = extractelement <4 x float> %w, i32 0
%res2 = insertelement <3 x float> %res1, float %t2, i32 2
ret <3 x float> %res2
}
define amdgpu_ps <3 x float> @sample_gather(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, <4 x i32> inreg %samp2, float %s, float %t) {
; GFX11-LABEL: sample_gather:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
; GFX1150-LABEL: sample_gather:
; GFX1150: ; %bb.0:
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: sample_gather:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: image_gather4_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc2, <4 x i32> %samp2, i1 false, i32 0, i32 0)
%v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
%t0 = extractelement <4 x float> %v, i32 0
%res0 = insertelement <3 x float> poison, float %t0, i32 0
%t1 = extractelement <4 x float> %v, i32 1
%res1 = insertelement <3 x float> %res0, float %t1, i32 1
%t2 = extractelement <4 x float> %w, i32 0
%res2 = insertelement <3 x float> %res1, float %t2, i32 2
ret <3 x float> %res2
}
define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) {
; GFX11-TRUE16-LABEL: sample_load:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: sample_load:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
; GFX1150-TRUE16-LABEL: sample_load:
; GFX1150-TRUE16: ; %bb.0:
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-TRUE16-NEXT: ; return to shader part epilog
;
; GFX1150-FAKE16-LABEL: sample_load:
; GFX1150-FAKE16: ; %bb.0:
; GFX1150-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-FAKE16-NEXT: ; return to shader part epilog
;
; GFX12-TRUE16-LABEL: sample_load:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-FAKE16-LABEL: sample_load:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX12-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: ; return to shader part epilog
%w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0)
%t0 = extractelement <4 x float> %v, i32 0
%res0 = insertelement <3 x float> poison, float %t0, i32 0
%t1 = extractelement <4 x float> %v, i32 1
%res1 = insertelement <3 x float> %res0, float %t1, i32 1
%t2 = extractelement <4 x float> %w, i32 0
%res2 = insertelement <3 x float> %res1, float %t2, i32 2
ret <3 x float> %res2
}
define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) {
; GFX11-TRUE16-LABEL: load_sample:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: load_sample:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
; GFX1150-TRUE16-LABEL: load_sample:
; GFX1150-TRUE16: ; %bb.0:
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-TRUE16-NEXT: ; return to shader part epilog
;
; GFX1150-FAKE16-LABEL: load_sample:
; GFX1150-FAKE16: ; %bb.0:
; GFX1150-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1150-FAKE16-NEXT: ; return to shader part epilog
;
; GFX12-TRUE16-LABEL: load_sample:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-FAKE16-LABEL: load_sample:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX12-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0)
%w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%t0 = extractelement <4 x float> %v, i32 0
%res0 = insertelement <3 x float> poison, float %t0, i32 0
%t1 = extractelement <4 x float> %v, i32 1
%res1 = insertelement <3 x float> %res0, float %t1, i32 1
%t2 = extractelement <4 x float> %w, i32 0
%res2 = insertelement <3 x float> %res1, float %t2, i32 2
ret <3 x float> %res2
}
declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32)