Reland "[AMDGPU] Wave32 CodeGen for amdgcn.ballot.i64"

This time without the extra `->dump()`

A recent addition to the device libs, `__ockl_dm_trim`, caused a series of
failures at O0 due to an i64 ballot intrinsic being inlined into a wave32 function.

The quick fix for this is to support codegen for this rare case.
A proper long-term fix for this type of issue is still being discussed.

Fixes SWDEV-408929, SWDEV-408957, SWDEV-409885, SWDEV-410193

Reviewed By: #amdgpu, arsenm

Differential Revision: https://reviews.llvm.org/D155050
This commit is contained in:
pvanhout 2023-07-12 10:23:28 +02:00
parent 5fca4ce1fd
commit 07c5920487
3 changed files with 160 additions and 21 deletions

View File

@ -1326,27 +1326,44 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const unsigned Size = MRI->getType(DstReg).getSizeInBits();
const bool Is64 = Size == 64;
const bool IsWave32 = (STI.getWavefrontSize() == 32);
if (Size != STI.getWavefrontSize())
// In the common case, the return type matches the wave size.
// However we also support emitting i64 ballots in wave32 mode.
if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
return false;
std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
const auto BuildCopy = [&](Register SrcReg) {
if (Size == STI.getWavefrontSize()) {
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
.addReg(SrcReg);
return;
}
// If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
.addReg(SrcReg)
.addImm(AMDGPU::sub0)
.addReg(HiReg)
.addImm(AMDGPU::sub1);
};
if (Arg) {
const int64_t Value = Arg->Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
} else if (Value == -1) { // all ones
Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
} else
} else if (Value == -1) // all ones
BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
else
return false;
} else {
Register SrcReg = I.getOperand(2).getReg();
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
}
} else
BuildCopy(I.getOperand(2).getReg());
I.eraseFromParent();
return true;

View File

@ -992,11 +992,18 @@ multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
(i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
>;
let WaveSizePredicate = isWave32 in
def : GCNPat <
(i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
(i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
>;
let WaveSizePredicate = isWave32 in {
def : GCNPat <
(i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
(i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
>;
// Support codegen of i64 setcc in wave32 mode.
def : GCNPat <
(i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
(i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1))
>;
}
}
defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
@ -1056,13 +1063,22 @@ multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
DSTCLAMP.NONE), SReg_64))
>;
let WaveSizePredicate = isWave32 in
def : GCNPat <
(i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
(i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
DSTCLAMP.NONE), SReg_32))
>;
let WaveSizePredicate = isWave32 in {
def : GCNPat <
(i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
(i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
DSTCLAMP.NONE), SReg_32))
>;
def : GCNPat <
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
(i64 (REG_SEQUENCE SReg_64, (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
DSTCLAMP.NONE), sub0,
(S_MOV_B32 (i32 0)), sub1))
>;
}
}
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;

View File

@ -0,0 +1,106 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
declare i64 @llvm.amdgcn.ballot.i64(i1)
declare i64 @llvm.ctpop.i64(i64)
; Test ballot(0): the condition is the constant false.
; Every RUN configuration is expected to materialize both 32-bit halves of the
; i64 result (s0 and s1) as zero.
define amdgpu_cs i64 @constant_false() {
; CHECK-LABEL: constant_false:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: ; return to shader part epilog
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0)
ret i64 %ballot
}
; Test ballot(1): the condition is the constant true.
; Both selectors are expected to copy exec_lo into the low half of the result.
; They differ on the high half: the -global-isel=0 output copies exec_hi, while
; the -global-isel output writes a literal 0.
; NOTE(review): confirm exec_hi is guaranteed to read as zero on these targets;
; otherwise the -global-isel=0 result could carry stray upper bits.
define amdgpu_cs i64 @constant_true() {
; DAGISEL-LABEL: constant_true:
; DAGISEL: ; %bb.0:
; DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; DAGISEL-NEXT: s_mov_b32 s1, exec_hi
; DAGISEL-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: constant_true:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_mov_b32 s0, exec_lo
; GISEL-NEXT: s_mov_b32 s1, 0
; GISEL-NEXT: ; return to shader part epilog
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1)
ret i64 %ballot
}
; Test ballot of a non-comparison operation: the i1 condition is produced by a
; trunc rather than an icmp/fcmp.
; The expected code masks bit 0 of the input, compares it against zero to form
; the low half (s0), and sets the high half (s1) to zero.
define amdgpu_cs i64 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
ret i64 %ballot
}
; Test ballot of comparisons
; Integer equality of two arguments: the compare is expected to write its
; result straight into the low half (s0), with the high half (s1) set to zero.
define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp eq i32 %x, %y
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
ret i64 %ballot
}
; Ballot of a signed integer comparison against a constant (x >= 99).
; The two selectors are expected to encode the same predicate differently:
; the -global-isel=0 output uses "0x62 < x", the -global-isel output uses
; "0x63 <= x". In both cases the high half (s1) is set to zero.
define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
; DAGISEL-LABEL: compare_int_with_constant:
; DAGISEL: ; %bb.0:
; DAGISEL-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0
; DAGISEL-NEXT: s_mov_b32 s1, 0
; DAGISEL-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: compare_int_with_constant:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0
; GISEL-NEXT: s_mov_b32 s1, 0
; GISEL-NEXT: ; return to shader part epilog
%cmp = icmp sge i32 %x, 99
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
ret i64 %ballot
}
; Ballot of a floating-point comparison (ordered greater-than) of two
; arguments. The compare result is expected in the low half (s0) and the high
; half (s1) is set to zero.
define amdgpu_cs i64 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
ret i64 %ballot
}
; Population count of an i64 ballot result.
; The ballot is expected to be built as a 64-bit pair (compare result in s0,
; zero in s1) and then fed to a single 64-bit scalar bit-count instruction.
define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
%bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
ret i64 %bcnt
}