AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (#102130)

isExtractHiElt should return new source register instead of returning
instruction that defines it. Src = MI.getOperand(0).getReg() is not
correct when MI(for example G_UNMERGE_VALUES) defines multiple registers.
Refactor existing code to work with source registers only.
This commit is contained in:
Petar Avramovic 2024-08-07 12:13:39 +02:00 committed by GitHub
parent e8429986c8
commit 269cefbc02
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 74 additions and 100 deletions

View File

@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
MachineInstrBuilder SelectedMI;
MachineOperand &LHS = I.getOperand(2);
MachineOperand &RHS = I.getOperand(3);
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
Register Src0Reg =
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
Register Src1Reg =
@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
}
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
Register BitcastSrc;
if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
Reg = BitcastSrc;
return Reg;
}
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
Register &Out) {
Register Trunc;
if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
return false;
Register LShlSrc;
if (mi_match(In, MRI,
m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
Out = LShlSrc;
Register Cst;
if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
Cst = stripCopy(Cst, MRI);
if (mi_match(Cst, MRI, m_SpecificICst(16))) {
Out = stripBitCast(LShlSrc, MRI);
return true;
}
}
MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
return false;
assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
LLT::fixed_vector(2, 16));
ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
assert(Mask.size() == 2);
if (Mask[0] == 1 && Mask[1] <= 1) {
Out = Shuffle->getOperand(0).getReg();
return true;
}
return false;
}
@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
bool IsCanonicalizing,
bool AllowAbs, bool OpSel) const {
Register Src = Root.getReg();
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
unsigned Mods = 0;
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
std::tie(Src, Mods) =
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
std::tie(Src, Mods) =
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
// FIXME: Handle op_sel
return {{
@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/false);
@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/true);
@ -5229,59 +5262,6 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
// Variant of stripBitCast that returns the instruction instead of a
// MachineOperand.
static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
if (MI->getOpcode() == AMDGPU::G_BITCAST)
return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
return MI;
}
// Figure out if this is really an extract of the high 16-bits of a dword,
// returns nullptr if it isn't.
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
MachineRegisterInfo &MRI) {
Inst = stripBitCast(Inst, MRI);
if (Inst->getOpcode() != AMDGPU::G_TRUNC)
return nullptr;
MachineInstr *TruncOp =
getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
TruncOp = stripBitCast(TruncOp, MRI);
// G_LSHR x, (G_CONSTANT i32 16)
if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
auto SrlAmount = getIConstantVRegValWithLookThrough(
TruncOp->getOperand(2).getReg(), MRI);
if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
MachineInstr *SrlOp =
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
return stripBitCast(SrlOp, MRI);
}
}
// G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
// 1, 0 swaps the low/high 16 bits.
// 1, 1 sets the high 16 bits to be the same as the low 16.
// in any case, it selects the high elts.
if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
LLT::fixed_vector(2, 16));
ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
assert(Mask.size() == 2);
if (Mask[0] == 1 && Mask[1] <= 1) {
MachineInstr *LHS =
getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
return stripBitCast(LHS, MRI);
}
}
return nullptr;
}
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
@ -5289,37 +5269,34 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
if (MI->getOpcode() == AMDGPU::G_FPEXT) {
MachineOperand *MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
assert(MRI->getType(Src) == LLT::scalar(16));
// See through bitcasts.
// FIXME: Would be nice to use stripBitCast here.
if (MI->getOpcode() == AMDGPU::G_BITCAST) {
MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);
}
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
// instruction that is being selected.
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
// since this could violate constant bus restriction.
Register PeekSrc = stripCopy(Src, *MRI);
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
MI = getDefIgnoringCopies(Src, *MRI);
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
if ((ModsTmp & SISrcMods::NEG) != 0)
if ((ModsTmp & SISrcMods::NEG) != 0) {
Mods ^= SISrcMods::NEG;
Src = PeekSrc;
}
if ((ModsTmp & SISrcMods::ABS) != 0)
if ((ModsTmp & SISrcMods::ABS) != 0) {
Mods |= SISrcMods::ABS;
Src = PeekSrc;
}
}
};
@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Mods |= SISrcMods::OP_SEL_1;
if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
Src = PeekSrc;
Mods |= SISrcMods::OP_SEL_0;
MI = ExtractHiEltMI;
MO = &MI->getOperand(0);
Src = MO->getReg();
CheckAbsNeg();
}

View File

@ -150,7 +150,7 @@ private:
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
bool AllowAbs = true,
bool OpSel = false) const;

View File

@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_matching_source_from_unmerge:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0
; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_matching_source_from_unmerge:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = load <4 x half>, ptr addrspace(3) %aptr, align 16