AMDGPU: Fix verifier error when waterfall call target is in AV register (#168017)
This commit is contained in:
parent
4d3ed10241
commit
b2f12331ab
@ -8177,26 +8177,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
|
||||
return;
|
||||
}
|
||||
|
||||
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
|
||||
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
|
||||
// Instead of creating a copy where src and dst are the same register
|
||||
// class, we just replace all uses of dst with src. These kinds of
|
||||
// copies interfere with the heuristics MachineSink uses to decide
|
||||
// whether or not to split a critical edge, since the pass assumes
|
||||
// that copies will end up as machine instructions and not be
|
||||
// eliminated.
|
||||
addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
|
||||
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
|
||||
Register NewDstReg = Inst.getOperand(1).getReg();
|
||||
MRI.replaceRegWith(DstReg, NewDstReg);
|
||||
MRI.clearKillFlags(NewDstReg);
|
||||
Inst.getOperand(0).setReg(DstReg);
|
||||
Inst.eraseFromParent();
|
||||
// Legalize t16 operand since replaceReg is called after addUsersToVALU
|
||||
for (MachineOperand &MO :
|
||||
make_early_inc_range(MRI.use_operands(NewDstReg))) {
|
||||
legalizeOperandsVALUt16(*MO.getParent(), MRI);
|
||||
const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
|
||||
if (const TargetRegisterClass *CommonRC =
|
||||
RI.getCommonSubClass(NewDstRC, SrcRC)) {
|
||||
// Instead of creating a copy where src and dst are the same register
|
||||
// class, we just replace all uses of dst with src. These kinds of
|
||||
// copies interfere with the heuristics MachineSink uses to decide
|
||||
// whether or not to split a critical edge, since the pass assumes
|
||||
// that copies will end up as machine instructions and not be
|
||||
// eliminated.
|
||||
addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
|
||||
MRI.replaceRegWith(DstReg, NewDstReg);
|
||||
MRI.clearKillFlags(NewDstReg);
|
||||
Inst.getOperand(0).setReg(DstReg);
|
||||
|
||||
if (!MRI.constrainRegClass(NewDstReg, CommonRC))
|
||||
llvm_unreachable("failed to constrain register");
|
||||
|
||||
Inst.eraseFromParent();
|
||||
// Legalize t16 operand since replaceReg is called after addUsersToVALU
|
||||
for (MachineOperand &MO :
|
||||
make_early_inc_range(MRI.use_operands(NewDstReg))) {
|
||||
legalizeOperandsVALUt16(*MO.getParent(), MRI);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// If this is a v2s copy between 16bit and 32bit reg,
|
||||
|
||||
@ -10733,15 +10733,16 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
|
||||
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: ;;#ASMSTART
|
||||
@ -11000,15 +11001,16 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
|
||||
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: ;;#ASMSTART
|
||||
@ -19023,15 +19025,16 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
|
||||
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: ;;#ASMSTART
|
||||
; GFX90A-NEXT: ; use a[0:1]
|
||||
@ -19282,15 +19285,16 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
|
||||
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: ;;#ASMSTART
|
||||
; GFX90A-NEXT: ; use a[0:1]
|
||||
|
||||
@ -43,26 +43,25 @@ define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %diverge
|
||||
; CHECK-LABEL: phi_with_alloca_and_divergent_copy_to_reg:
|
||||
; CHECK: ; %bb.0: ; %entry
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_lshr_b32 s6, s32, 6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v7, v2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v6, v1
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s6
|
||||
; CHECK-NEXT: v_lshrrev_b32_e64 v2, 6, s32
|
||||
; CHECK-NEXT: .LBB1_1: ; %loop
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: v_add_u32_e32 v8, 1, v3
|
||||
; CHECK-NEXT: v_lshl_add_u32 v5, v3, 2, v1
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v8
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v2
|
||||
; CHECK-NEXT: v_lshl_add_u32 v2, v3, 2, v1
|
||||
; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
|
||||
; CHECK-NEXT: v_add_u32_e32 v2, 1, v3
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2
|
||||
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v4
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, v0
|
||||
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %done
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
||||
; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: global_store_dword v[6:7], v0, off
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -6,8 +6,8 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
|
||||
; GFX942-LABEL: matmul_kernel:
|
||||
; GFX942: ; %bb.0: ; %entry
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_mov_b32 s2, 0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
|
||||
; GFX942-NEXT: s_mov_b32 s3, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
||||
@ -0,0 +1,141 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; Make sure SIFixSGPRCopies handles situations where it needs to fix
|
||||
; up copies to physical registers from an AV virtual register.
|
||||
|
||||
define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) {
|
||||
; CHECK-LABEL: fix_sgpr_copies_indirect_call:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s16, s33
|
||||
; CHECK-NEXT: s_mov_b32 s33, s32
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
|
||||
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
|
||||
; CHECK-NEXT: v_writelane_b32 v40, s16, 4
|
||||
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
|
||||
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
|
||||
; CHECK-NEXT: s_add_i32 s32, s32, 0x800
|
||||
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; CHECK-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s15, 0
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s14, 1
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s13, 2
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s12, 3
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s10, 4
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s11, 5
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s8, 6
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s9, 7
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s6, 8
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s7, 9
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s4, 10
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s5, 11
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
|
||||
; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:4
|
||||
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v2
|
||||
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: ; %bb.1: ; %bb1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], exec
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s4, 12
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s5, 13
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(1)
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; CHECK-NEXT: s_mov_b32 s4, s6
|
||||
; CHECK-NEXT: s_mov_b32 s5, s8
|
||||
; CHECK-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
|
||||
; CHECK-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
|
||||
; CHECK-NEXT: s_mov_b32 s7, s8
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s6, 14
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s7, 15
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s4, 16
|
||||
; CHECK-NEXT: v_writelane_b32 v41, s5, 17
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; CHECK-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_readlane_b32 s16, v41, 14
|
||||
; CHECK-NEXT: v_readlane_b32 s17, v41, 15
|
||||
; CHECK-NEXT: v_readlane_b32 s15, v41, 0
|
||||
; CHECK-NEXT: v_readlane_b32 s14, v41, 1
|
||||
; CHECK-NEXT: v_readlane_b32 s13, v41, 2
|
||||
; CHECK-NEXT: v_readlane_b32 s12, v41, 3
|
||||
; CHECK-NEXT: v_readlane_b32 s10, v41, 4
|
||||
; CHECK-NEXT: v_readlane_b32 s11, v41, 5
|
||||
; CHECK-NEXT: v_readlane_b32 s8, v41, 6
|
||||
; CHECK-NEXT: v_readlane_b32 s9, v41, 7
|
||||
; CHECK-NEXT: v_readlane_b32 s6, v41, 8
|
||||
; CHECK-NEXT: v_readlane_b32 s7, v41, 9
|
||||
; CHECK-NEXT: v_readlane_b32 s4, v41, 10
|
||||
; CHECK-NEXT: v_readlane_b32 s5, v41, 11
|
||||
; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
|
||||
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
|
||||
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
|
||||
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_readlane_b32 s4, v41, 16
|
||||
; CHECK-NEXT: v_readlane_b32 s5, v41, 17
|
||||
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB0_2
|
||||
; CHECK-NEXT: ; %bb.4:
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_readlane_b32 s4, v41, 12
|
||||
; CHECK-NEXT: v_readlane_b32 s5, v41, 13
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
|
||||
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
|
||||
; CHECK-NEXT: s_mov_b32 s32, s33
|
||||
; CHECK-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
|
||||
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; CHECK-NEXT: s_mov_b32 s33, s4
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%i = load ptr, ptr addrspace(5) %ptr, align 8
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb
|
||||
tail call void %i()
|
||||
ret i32 0
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user