To make this easier, I also removed all "removeFromParent" calls from the visitors; instead, each visitor adds instructions to a set of instructions to delete once the function has been visited. This avoids crashes caused by visitor functions deleting their operands. In theory we could allow a visitor function to delete the instruction it visited (and only that one), but I think having one idiom for everything is less error-prone. Fixes #140219.
220 lines
9.2 KiB
LLVM
220 lines
9.2 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s

; Kernel with two loop-carried <4 x i32> vectors that are read and written
; through a dynamic index derived from %q:
;   %x2 - element [%q] is OR'ed with %p and written back to the same slot.
;   %x1 - element [%q] is copied into element 0.
; %for.body branches only to itself, so the IR loop has no exit; the expected
; output still ends in a "DummyReturnBlock" containing s_endpgm.
;
; The per-target assertion groups below (gfx90a, gfx942, gfx1030, gfx1100)
; expect the dynamic indexing to be emitted as s_cmp_eq_u32 / s_cselect chains
; on scalar registers. They are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX90A-LABEL: test_insert_extract:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX90A-NEXT: s_mov_b32 s2, 0
; GFX90A-NEXT: s_and_b64 vcc, exec, -1
; GFX90A-NEXT: s_mov_b32 s3, 0
; GFX90A-NEXT: s_mov_b32 s4, 0
; GFX90A-NEXT: s_mov_b32 s5, 0
; GFX90A-NEXT: s_mov_b32 s6, 0
; GFX90A-NEXT: .LBB0_1: ; %for.body
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s7, s3, s2
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s7, s4, s7
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
; GFX90A-NEXT: s_or_b32 s7, s7, s0
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec
; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec
; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
; GFX90A-NEXT: s_cmp_eq_u32 s1, 0
; GFX90A-NEXT: s_cselect_b32 s2, s7, s2
; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s6, 0, s6
; GFX90A-NEXT: s_mov_b64 vcc, vcc
; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_insert_extract:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: s_mov_b32 s2, 0
; GFX942-NEXT: s_and_b64 vcc, exec, -1
; GFX942-NEXT: s_mov_b32 s3, 0
; GFX942-NEXT: s_mov_b32 s4, 0
; GFX942-NEXT: s_mov_b32 s5, 0
; GFX942-NEXT: s_mov_b32 s6, 0
; GFX942-NEXT: .LBB0_1: ; %for.body
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_cmp_eq_u32 s1, 1
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX942-NEXT: s_cselect_b32 s7, s3, s2
; GFX942-NEXT: s_cmp_eq_u32 s1, 2
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX942-NEXT: s_cselect_b32 s7, s4, s7
; GFX942-NEXT: s_cmp_eq_u32 s1, 3
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX942-NEXT: s_cselect_b32 s7, s5, s7
; GFX942-NEXT: s_or_b32 s7, s7, s0
; GFX942-NEXT: s_cmp_eq_u32 s1, 1
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[10:11], s[8:9], exec
; GFX942-NEXT: s_cselect_b32 s3, s7, s3
; GFX942-NEXT: s_cmp_eq_u32 s1, 3
; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX942-NEXT: s_and_b64 s[12:13], s[10:11], exec
; GFX942-NEXT: s_cselect_b32 s5, s7, s5
; GFX942-NEXT: s_cmp_eq_u32 s1, 2
; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX942-NEXT: s_and_b64 s[14:15], s[12:13], exec
; GFX942-NEXT: s_cselect_b32 s4, s7, s4
; GFX942-NEXT: s_cmp_eq_u32 s1, 0
; GFX942-NEXT: s_cselect_b32 s2, s7, s2
; GFX942-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX942-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX942-NEXT: s_cselect_b32 s6, 0, s6
; GFX942-NEXT: s_mov_b64 vcc, vcc
; GFX942-NEXT: s_cbranch_vccnz .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX942-NEXT: s_endpgm
;
; GFX1030-LABEL: test_insert_extract:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX1030-NEXT: s_mov_b32 s2, 0
; GFX1030-NEXT: s_mov_b32 s3, 0
; GFX1030-NEXT: s_mov_b32 s4, 0
; GFX1030-NEXT: s_mov_b32 s5, 0
; GFX1030-NEXT: s_mov_b32 s6, 0
; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo
; GFX1030-NEXT: .p2align 6
; GFX1030-NEXT: .LBB0_1: ; %for.body
; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1030-NEXT: s_cselect_b32 s7, s3, s2
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1030-NEXT: s_cselect_b32 s7, s4, s7
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
; GFX1030-NEXT: s_or_b32 s7, s7, s0
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo
; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s9, -1, 0
; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo
; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s10, -1, 0
; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo
; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
; GFX1030-NEXT: s_cmp_eq_u32 s1, 0
; GFX1030-NEXT: s_cselect_b32 s2, s7, s2
; GFX1030-NEXT: s_or_b32 s7, s10, s8
; GFX1030-NEXT: s_or_b32 s7, s9, s7
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1030-NEXT: s_cselect_b32 s6, 0, s6
; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: test_insert_extract:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1100-NEXT: s_mov_b32 s2, 0
; GFX1100-NEXT: s_mov_b32 s3, 0
; GFX1100-NEXT: s_mov_b32 s4, 0
; GFX1100-NEXT: s_mov_b32 s5, 0
; GFX1100-NEXT: s_mov_b32 s6, 0
; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo
; GFX1100-NEXT: .p2align 6
; GFX1100-NEXT: .LBB0_1: ; %for.body
; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
; GFX1100-NEXT: s_cselect_b32 s7, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1100-NEXT: s_cselect_b32 s7, s3, s2
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1100-NEXT: s_cselect_b32 s7, s4, s7
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
; GFX1100-NEXT: s_or_b32 s7, s7, s0
; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo
; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s9, -1, 0
; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo
; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s10, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo
; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
; GFX1100-NEXT: s_cmp_eq_u32 s1, 0
; GFX1100-NEXT: s_cselect_b32 s2, s7, s2
; GFX1100-NEXT: s_or_b32 s7, s10, s8
; GFX1100-NEXT: s_or_b32 s7, s9, s7
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1100-NEXT: s_cselect_b32 s6, 0, s6
; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1100-NEXT: s_endpgm
entry:
  ; Initial value of %x1: constant 0 inserted into element 0 of an all-zero
  ; vector.
  %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0
  br label %for.body

for.body: ; preds = %for.body, %entry
  ; Loop-carried state: %x1 is fed by %i4, %x2 by %i2.
  %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ]
  %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ]
  ; The same zero-extended %q indexes every dynamic extract/insert below.
  %idxprom = zext i32 %q to i64
  ; %x2[%q] |= %p
  %e1 = extractelement <4 x i32> %x2, i64 %idxprom
  %add = or i32 %e1, %p
  %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom
  ; %x1[0] = %x1[%q]
  %e3 = extractelement <4 x i32> %x1, i64 %idxprom
  %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0
  ; Unconditional back-edge: the loop never exits.
  br label %for.body
}
|
|
|