[AMDGPU] si-peephole-sdwa: Fix cndmask vcc use for wave32 (#139541)
Before V_CNDMASK_B32_e64 is converted to SDWA form, it is first converted to V_CNDMASK_B32_e32. On wave32, the implicit vcc use of the resulting instruction must be rewritten to a vcc_lo use. Previously, this fix-up was applied only after the final conversion to SDWA form, which led to a compiler error in situations where the conversion to SDWA was aborted. Make sure that the vcc fix-up is applied even if the SDWA conversion is not completed. --------- Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
This commit is contained in:
parent
0b490f11da
commit
1377535d99
@ -1105,6 +1105,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
|
||||
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
|
||||
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
|
||||
.setMIFlags(MI.getFlags());
|
||||
TII->fixImplicitOperands(*Converted);
|
||||
LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
|
||||
(void)Converted;
|
||||
MI.eraseFromParent();
|
||||
|
65
llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
Normal file
65
llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
Normal file
@ -0,0 +1,65 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s
|
||||
|
||||
; In this test, V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32,
|
||||
; but the expected conversion to SDWA does not occur. This led to a
|
||||
; compilation error, because the use of $vcc in the resulting
|
||||
; instruction must be fixed to $vcc_lo for wave32, which only happened
|
||||
; after the full conversion to SDWA.
|
||||
|
||||
define void @quux(i32 %arg, i1 %arg1, i1 %arg2) {
|
||||
; CHECK-LABEL: quux:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %bb3
|
||||
; CHECK-NEXT: v_and_b32_e32 v1, 0x3ff, v31
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
|
||||
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
|
||||
; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:3
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_and_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 24
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
|
||||
; CHECK-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; CHECK-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; CHECK-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; CHECK-NEXT: .LBB0_2: ; %bb9
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, 0
|
||||
; CHECK-NEXT: global_store_byte v[2:3], v1, off
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
br i1 %arg1, label %bb9, label %bb3
|
||||
|
||||
bb3: ; preds = %bb
|
||||
%call = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%mul = mul i32 %call, 5
|
||||
%zext = zext i32 %mul to i64
|
||||
%getelementptr = getelementptr i8, ptr addrspace(1) null, i64 %zext
|
||||
%getelementptr4 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 4
|
||||
%load = load i8, ptr addrspace(1) %getelementptr4, align 1
|
||||
%getelementptr5 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 3
|
||||
%load6 = load i8, ptr addrspace(1) %getelementptr5, align 1
|
||||
%insertelement = insertelement <5 x i8> poison, i8 %load, i64 4
|
||||
%select = select i1 %arg2, <5 x i8> %insertelement, <5 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0>
|
||||
%insertelement7 = insertelement <5 x i8> %select, i8 %load6, i64 0
|
||||
%icmp = icmp ult i32 0, %arg
|
||||
%select8 = select i1 %icmp, <5 x i8> zeroinitializer, <5 x i8> %insertelement7
|
||||
%shufflevector = shufflevector <5 x i8> zeroinitializer, <5 x i8> %select8, <5 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9>
|
||||
br label %bb9
|
||||
|
||||
bb9: ; preds = %bb3, %bb
|
||||
%phi = phi <5 x i8> [ %shufflevector, %bb3 ], [ zeroinitializer, %bb ]
|
||||
%extractelement = extractelement <5 x i8> %phi, i64 0
|
||||
store i8 %extractelement, ptr addrspace(1) null, align 1
|
||||
ret void
|
||||
}
|
@ -230,3 +230,92 @@ body: |
|
||||
$vgpr0 = COPY %3
|
||||
SI_RETURN implicit $vgpr0
|
||||
...
|
||||
|
||||
---
|
||||
name: cndmask-not-converted
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
; CHECK-LABEL: name: cndmask-not-converted
|
||||
; CHECK: bb.0:
|
||||
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK-NEXT: liveins: $vgpr0, $sgpr8_sgpr9
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
|
||||
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0
|
||||
; CHECK-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_CSELECT_B32_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_]]
|
||||
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc_lo
|
||||
; CHECK-NEXT: S_BRANCH %bb.1
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.1:
|
||||
; CHECK-NEXT: successors: %bb.2(0x80000000)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
|
||||
; CHECK-NEXT: [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[COPY1]](s32), 5, 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_U32_U24_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1
|
||||
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE]], 3, 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_USHORT]], 255, implicit $exec
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
|
||||
; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_MOV_B32_e32_2]], 0, [[GLOBAL_LOAD_USHORT]], 0, 6, 0, 6, 0, implicit $exec
|
||||
; CHECK-NEXT: S_CMP_EQ_U32 [[COPY2]].sub0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
|
||||
; CHECK-NEXT: $vcc_lo = COPY [[S_CSELECT_B32_1]]
|
||||
; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_AND_B32_sdwa]], implicit $vcc_lo, implicit $exec
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 24, implicit $exec
|
||||
; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_MOV_B32_e32_3]], 0, [[V_CNDMASK_B32_e32_]], 0, 1, 0, 6, 6, implicit $exec
|
||||
; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CNDMASK_B32_e32_]], implicit $exec
|
||||
; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec
|
||||
; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_CNDMASK_B32_e32_]], 0, [[V_MOV_B32_e32_4]], 0, 6, 0, 5, 6, implicit $exec
|
||||
; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa1]], 0, [[V_LSHRREV_B32_sdwa]], 0, 5, 0, 6, 6, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.2:
|
||||
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_OR_B32_sdwa]], %bb.1
|
||||
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
|
||||
; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], [[PHI]], 0, 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
liveins: $vgpr0, $sgpr8_sgpr9
|
||||
|
||||
%0:sgpr_64 = COPY $sgpr8_sgpr9
|
||||
%1:vgpr_32 = COPY $vgpr0
|
||||
%2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0
|
||||
S_BITCMP1_B32 %2.sub1, 0, implicit-def $scc
|
||||
%3:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
|
||||
%4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%5:sreg_32 = S_AND_B32 $exec_lo, %3, implicit-def dead $scc
|
||||
$vcc_lo = COPY %5
|
||||
S_CBRANCH_VCCNZ %bb.2, implicit $vcc
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2(0x80000000)
|
||||
|
||||
%6:sreg_64 = COPY %2
|
||||
%7:vgpr_32 = V_MUL_U32_U24_e64 %1(s32), 5, 0, implicit $exec
|
||||
%8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%9:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, killed %8, %subreg.sub1
|
||||
%10:vgpr_32 = GLOBAL_LOAD_USHORT %9, 3, 0, implicit $exec
|
||||
%11:vgpr_32 = V_AND_B32_e64 %10, 255, implicit $exec
|
||||
%12:vgpr_32 = V_AND_B32_e64 65535, killed %11, implicit $exec
|
||||
S_CMP_EQ_U32 %6.sub0, 0, implicit-def $scc
|
||||
%13:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
|
||||
%14:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %12, %13, implicit $exec
|
||||
%15:vgpr_32 = V_LSHRREV_B32_e64 24, %14, implicit $exec
|
||||
%16:vgpr_32 = V_LSHLREV_B16_e64 8, %15, implicit $exec
|
||||
%17:vgpr_32 = V_LSHRREV_B32_e64 16, %14, implicit $exec
|
||||
%18:vgpr_32 = V_AND_B32_e64 %17, 255, implicit $exec
|
||||
%19:vgpr_32 = V_OR_B32_e64 killed %18, killed %16, implicit $exec
|
||||
%20:vgpr_32 = V_LSHLREV_B32_e64 16, killed %19, implicit $exec
|
||||
|
||||
bb.2:
|
||||
%21:vgpr_32 = PHI %4, %bb.0, %20, %bb.1
|
||||
%22:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
|
||||
GLOBAL_STORE_BYTE killed %22, %21, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
Loading…
x
Reference in New Issue
Block a user