llvm-project/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp

//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Merge the offset of address calculation into the offset field
// of instructions in a global address lowering sequence.
//
//===----------------------------------------------------------------------===//
#include "LoongArch.h"
#include "LoongArchTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include <optional>
using namespace llvm;
#define DEBUG_TYPE "loongarch-merge-base-offset"
#define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset"
namespace {
class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
const LoongArchSubtarget *ST = nullptr;
MachineRegisterInfo *MRI;
public:
static char ID;
bool runOnMachineFunction(MachineFunction &Fn) override;
bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
MachineInstr *&Last);
bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
MachineInstr *&Lo12);
bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
MachineInstr *&Last);
void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
int64_t Offset);
bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
MachineInstr *&Last, MachineInstr &TailAdd,
Register GAReg);
bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
MachineInstr *&Last);
LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().setIsSSA();
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
StringRef getPassName() const override {
return LoongArch_MERGE_BASE_OFFSET_NAME;
}
};
} // end anonymous namespace
char LoongArchMergeBaseOffsetOpt::ID = 0;
INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE,
LoongArch_MERGE_BASE_OFFSET_NAME, false, false)
// Detect either of the patterns:
//
// 1. (small/medium):
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, vreg1, %pc_lo12(s)
//
// 2. (large):
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, $zero, %pc_lo12(s)
// lu32i.d vreg3, vreg2, %pc64_lo20(s)
// lu52i.d vreg4, vreg3, %pc64_hi12(s)
// add.d vreg5, vreg4, vreg1
// The pattern is only accepted if:
// 1) For the small and medium patterns, the first instruction has only one
// use, which is the ADDI.
// 2) For the large pattern, the first four instructions each have only one
// use, and the user of the fourth instruction is the ADD.
// 3) The address operands have the appropriate type, reflecting the
// lowering of a global address or constant pool using the pattern.
// 4) The offset value in the Global Address or Constant Pool is 0.
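// Requiring single uses keeps the rewrite safe: once an offset is merged into
// these operands, any other user of the intermediate registers would observe
// an address that no longer points at s itself.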
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
MachineInstr *&Lo12,
MachineInstr *&Lo20,
MachineInstr *&Hi12,
MachineInstr *&Last) {
if (Hi20.getOpcode() != LoongArch::PCALAU12I)
return false;
const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI)
return false;
auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) {
return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress();
};
if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0)
return false;
Register HiDestReg = Hi20.getOperand(0).getReg();
if (!MRI->hasOneUse(HiDestReg))
return false;
MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg);
if (UseInst->getOpcode() != LoongArch::ADD_D) {
Lo12 = UseInst;
if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
(!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
return false;
} else {
assert(ST->is64Bit());
Last = UseInst;
Register LastOp1Reg = Last->getOperand(1).getReg();
if (!LastOp1Reg.isVirtual())
return false;
Hi12 = MRI->getVRegDef(LastOp1Reg);
const MachineOperand &Hi12Op2 = Hi12->getOperand(2);
if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI)
return false;
if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0)
return false;
if (!MRI->hasOneUse(Hi12->getOperand(0).getReg()))
return false;
Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg());
const MachineOperand &Lo20Op2 = Lo20->getOperand(2);
if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO)
return false;
if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0)
return false;
if (!MRI->hasOneUse(Lo20->getOperand(0).getReg()))
return false;
Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg());
if (!MRI->hasOneUse(Lo12->getOperand(0).getReg()))
return false;
}
const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
assert(Hi20.getOpcode() == LoongArch::PCALAU12I);
if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO ||
!(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
Lo12Op2.getOffset() != 0)
return false;
if (Hi20Op1.isGlobal()) {
LLVM_DEBUG(dbgs() << " Found lowered global address: "
<< *Hi20Op1.getGlobal() << "\n");
} else if (Hi20Op1.isBlockAddress()) {
LLVM_DEBUG(dbgs() << " Found lowered block address: "
<< *Hi20Op1.getBlockAddress() << "\n");
} else if (Hi20Op1.isCPI()) {
LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
<< "\n");
}
return true;
}
// Detect the pattern:
//
// (small/medium):
// lu12i.w vreg1, %le_hi20_r(s)
// add.w/d vreg2, vreg1, r2, %le_add_r(s)
// addi.w/d vreg3, vreg2, %le_lo12_r(s)
// The pattern is only accepted if:
// 1) The first instruction has only one use, which is the PseudoAddTPRel.
// The second instruction has only one use, which is the ADDI. The
// second instruction's last register operand is the tp register ($r2).
// 2) The address operands have the appropriate type, reflecting the
// lowering of a thread_local global address using the pattern.
// 3) The offset value in the ThreadLocal Global Address is 0.
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
MachineInstr *&Add,
MachineInstr *&Lo12) {
if (Hi20.getOpcode() != LoongArch::LU12I_W)
return false;
auto isGlobalOrCPI = [](const MachineOperand &Op) {
return Op.isGlobal() || Op.isCPI();
};
const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||
!isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)
return false;
Register HiDestReg = Hi20.getOperand(0).getReg();
if (!MRI->hasOneUse(HiDestReg))
return false;
Add = &*MRI->use_instr_begin(HiDestReg);
if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||
(!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))
return false;
if (Add->getOperand(2).getReg() != LoongArch::R2)
return false;
const MachineOperand &AddOp3 = Add->getOperand(3);
if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||
!(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||
AddOp3.getOffset() != 0)
return false;
Register AddDestReg = Add->getOperand(0).getReg();
if (!MRI->hasOneUse(AddDestReg))
return false;
Lo12 = &*MRI->use_instr_begin(AddDestReg);
if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
(!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
return false;
const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||
!(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
Lo12Op2.getOffset() != 0)
return false;
if (Hi20Op1.isGlobal()) {
LLVM_DEBUG(dbgs() << " Found lowered global address: "
<< *Hi20Op1.getGlobal() << "\n");
} else if (Hi20Op1.isCPI()) {
LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
<< "\n");
}
return true;
}
// Update the offset in the Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
// Delete the tail instruction and rewrite all uses of its result to use the
// output of Last (or of Lo12 when there is no Last).
void LoongArchMergeBaseOffsetOpt::foldOffset(
MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
int64_t Offset) {
// Put the offset back into the Hi20 and Lo12 operands.
Hi20.getOperand(1).setOffset(Offset);
Lo12.getOperand(2).setOffset(Offset);
if (Lo20 && Hi12) {
Lo20->getOperand(2).setOffset(Offset);
Hi12->getOperand(2).setOffset(Offset);
}
// For tls-le, the offset of the second PseudoAddTPRel instr should also be
// updated.
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
if (Hi20.getOpcode() == LoongArch::LU12I_W)
Add->getOperand(3).setOffset(Offset);
// Delete the tail instruction.
MachineInstr *Def = Last ? Last : &Lo12;
MRI->constrainRegClass(Def->getOperand(0).getReg(),
MRI->getRegClass(Tail.getOperand(0).getReg()));
MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
Tail.eraseFromParent();
LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
<< " " << Hi20;);
if (Hi20.getOpcode() == LoongArch::LU12I_W) {
LLVM_DEBUG(dbgs() << " " << *Add;);
}
LLVM_DEBUG(dbgs() << " " << Lo12;);
if (Lo20 && Hi12) {
LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;);
}
}
// Detect patterns for large offsets that are passed into an ADD instruction.
// If the pattern is found, updates the offset in Hi20, (Add), Lo12,
// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
// produced the offset.
//
// (The instructions marked with "!" are not necessarily present)
//
// Base address lowering is of the form:
// 1) pcala:
// Hi20: pcalau12i vreg1, %pc_hi20(s)
// +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
// +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
// |
// | 2) tls-le:
// | Hi20: lu12i.w vreg1, %le_hi20_r(s)
// | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)
// +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
// |
// | The large offset can be one of the forms:
// |
// +-> 1) Offset that has non zero bits in Hi20 and Lo12 bits:
// | OffsetHi20: lu12i.w vreg3, 4
// | OffsetLo12: ori voff, vreg3, 188 ------------------+
// | |
// +-> 2) Offset that has non zero bits in Hi20 bits only: |
// | OffsetHi20: lu12i.w voff, 128 ------------------+
// | |
// +-> 3) Offset that has non zero bits in Lo20 bits: |
// | OffsetHi20: lu12i.w vreg3, 121 ! |
// | OffsetLo12: ori voff, vreg3, 122 ! |
// | OffsetLo20: lu32i.d voff, 123 ------------------+
// +-> 4) Offset that has non zero bits in Hi12 bits: |
// OffsetHi20: lu12i.w vreg3, 121 ! |
// OffsetLo12: ori voff, vreg3, 122 ! |
// OffsetLo20: lu32i.d vreg3, 123 ! |
// OffsetHi12: lu52i.d voff, vreg3, 124 ------------------+
// |
// TailAdd: add.d vreg4, vreg2, voff <------------------+
//
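// Worked example for form 1 above: starting from voff, the walk below first
// adds 188 from the ORI, then SignExtend64<32>(4 << 12) = 16384 from the
// LU12I.W, so Offset = 16572 is handed to foldOffset() and the offset
// instructions are erased.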
bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,
Register GAReg) {
assert((TailAdd.getOpcode() == LoongArch::ADD_W ||
TailAdd.getOpcode() == LoongArch::ADD_D) &&
"Expected ADD instruction!");
Register Rs = TailAdd.getOperand(1).getReg();
Register Rt = TailAdd.getOperand(2).getReg();
Register Reg = Rs == GAReg ? Rt : Rs;
SmallVector<MachineInstr *, 4> Instrs;
int64_t Offset = 0;
int64_t Mask = -1;
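// Mask tracks which offset bits are still unclaimed. Once LU52I.D or LU32I.D
// has contributed bits [63:52] or [51:32], those bits are cleared from Mask
// so that the sign extension of the lower parts (LU32I.D, LU12I.W) seen later
// in the walk cannot double-count them.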
// Reg can be defined by one of [ORI, LU12I.W, LU32I.D, LU52I.D]; walk that
// chain and accumulate the offset:
for (int i = 0; i < 4; i++) {
// Stop when the chain reaches R0 (the zero register).
if (Reg == LoongArch::R0)
break;
// Can't fold if the register has more than one use.
if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
return false;
MachineInstr *Curr = MRI->getVRegDef(Reg);
if (!Curr)
break;
switch (Curr->getOpcode()) {
default:
// Can't fold if the instruction opcode is unexpected.
return false;
case LoongArch::ORI: {
MachineOperand ImmOp = Curr->getOperand(2);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
return false;
Offset += ImmOp.getImm();
Reg = Curr->getOperand(1).getReg();
Instrs.push_back(Curr);
break;
}
case LoongArch::LU12I_W: {
MachineOperand ImmOp = Curr->getOperand(1);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
return false;
Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;
Reg = LoongArch::R0;
Instrs.push_back(Curr);
break;
}
case LoongArch::LU32I_D: {
MachineOperand ImmOp = Curr->getOperand(2);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)
return false;
Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;
Mask ^= 0x000FFFFF00000000ULL;
Reg = Curr->getOperand(1).getReg();
Instrs.push_back(Curr);
break;
}
case LoongArch::LU52I_D: {
MachineOperand ImmOp = Curr->getOperand(2);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)
return false;
Offset += ImmOp.getImm() << 52;
Mask ^= 0xFFF0000000000000ULL;
Reg = Curr->getOperand(1).getReg();
Instrs.push_back(Curr);
break;
}
}
}
// Can't fold if no offset was extracted.
if (!Offset)
return false;
foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
LLVM_DEBUG(dbgs() << " Offset Instrs:\n");
for (auto I : Instrs) {
LLVM_DEBUG(dbgs() << " " << *I);
I->eraseFromParent();
}
return true;
}
bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
MachineInstr &Lo12,
MachineInstr *&Lo20,
MachineInstr *&Hi12,
MachineInstr *&Last) {
Register DestReg =
Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();
// Look for arithmetic instructions we can get an offset from.
// We might be able to remove the arithmetic instructions by folding the
// offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
// LU12I_W+PseudoAddTPRel+ADDI.
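// For example (small/medium pattern, with an illustrative offset of 16):
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, vreg1, %pc_lo12(s)
// addi.d vreg3, vreg2, 16
// =>
// pcalau12i vreg1, %pc_hi20(s+16)
// addi.d vreg2, vreg1, %pc_lo12(s+16)
// and all uses of vreg3 are rewritten to use vreg2.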
if (!MRI->hasOneUse(DestReg))
return false;
// DestReg has only one use.
MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
switch (Tail.getOpcode()) {
default:
LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
<< Tail);
break;
case LoongArch::ADDI_W:
if (ST->is64Bit())
return false;
[[fallthrough]];
case LoongArch::ADDI_D:
case LoongArch::ADDU16I_D: {
// Offset is simply an immediate operand.
int64_t Offset = Tail.getOperand(2).getImm();
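// ADDU16I_D adds its 16-bit immediate shifted left by 16 (sign-extended from
// 32 bits), so scale the raw immediate the same way before merging it.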
if (Tail.getOpcode() == LoongArch::ADDU16I_D)
Offset = SignExtend64<32>(Offset << 16);
// We might have two ADDIs in a row.
Register TailDestReg = Tail.getOperand(0).getReg();
if (MRI->hasOneUse(TailDestReg)) {
MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W)
return false;
if (TailTail.getOpcode() == LoongArch::ADDI_W ||
TailTail.getOpcode() == LoongArch::ADDI_D) {
Offset += TailTail.getOperand(2).getImm();
LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail);
foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset);
Tail.eraseFromParent();
return true;
}
}
LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail);
foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset);
return true;
}
case LoongArch::ADD_W:
if (ST->is64Bit())
return false;
[[fallthrough]];
case LoongArch::ADD_D:
// The offset is too large to fit in the immediate field of ADDI.
return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
}
return false;
}
// Memory access opcode mapping for transforms.
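// For the small/medium patterns the folded offset ends up in the memory op's
// symbol/immediate operand, so the original opcode is kept. For the large
// pattern the address stays split across two registers, so loads/stores are
// rewritten to their indexed (register + register) forms, e.g. LD_W -> LDX_W.
// VLDREPL_B/XVLDREPL_B are kept as-is; foldIntoMemoryOps() rejects the large
// pattern for them.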
static unsigned getNewOpc(unsigned Op, bool isLarge) {
switch (Op) {
case LoongArch::LD_B:
return isLarge ? LoongArch::LDX_B : LoongArch::LD_B;
case LoongArch::LD_H:
return isLarge ? LoongArch::LDX_H : LoongArch::LD_H;
case LoongArch::LD_W:
case LoongArch::LDPTR_W:
return isLarge ? LoongArch::LDX_W : LoongArch::LD_W;
case LoongArch::LD_D:
case LoongArch::LDPTR_D:
return isLarge ? LoongArch::LDX_D : LoongArch::LD_D;
case LoongArch::LD_BU:
return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU;
case LoongArch::LD_HU:
return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU;
case LoongArch::LD_WU:
return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU;
case LoongArch::FLD_S:
return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S;
case LoongArch::FLD_D:
return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D;
case LoongArch::VLD:
return isLarge ? LoongArch::VLDX : LoongArch::VLD;
case LoongArch::XVLD:
return isLarge ? LoongArch::XVLDX : LoongArch::XVLD;
case LoongArch::VLDREPL_B:
return LoongArch::VLDREPL_B;
case LoongArch::XVLDREPL_B:
return LoongArch::XVLDREPL_B;
case LoongArch::ST_B:
return isLarge ? LoongArch::STX_B : LoongArch::ST_B;
case LoongArch::ST_H:
return isLarge ? LoongArch::STX_H : LoongArch::ST_H;
case LoongArch::ST_W:
case LoongArch::STPTR_W:
return isLarge ? LoongArch::STX_W : LoongArch::ST_W;
case LoongArch::ST_D:
case LoongArch::STPTR_D:
return isLarge ? LoongArch::STX_D : LoongArch::ST_D;
case LoongArch::FST_S:
return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S;
case LoongArch::FST_D:
return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D;
case LoongArch::VST:
return isLarge ? LoongArch::VSTX : LoongArch::VST;
case LoongArch::XVST:
return isLarge ? LoongArch::XVSTX : LoongArch::XVST;
default:
llvm_unreachable("Unexpected opcode for replacement");
}
}
bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
MachineInstr &Lo12,
MachineInstr *&Lo20,
MachineInstr *&Hi12,
MachineInstr *&Last) {
Register DestReg =
Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();
// If all the uses are memory ops with the same offset, we can transform:
//
// 1. (small/medium):
// 1.1. pcala
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, vreg1, %pc_lo12(s)
// ld.w vreg3, 8(vreg2)
//
// =>
//
// pcalau12i vreg1, %pc_hi20(s+8)
// ld.w vreg3, vreg1, %pc_lo12(s+8)
//
// 1.2. tls-le
// lu12i.w vreg1, %le_hi20_r(s)
// add.w/d vreg2, vreg1, r2, %le_add_r(s)
// addi.w/d vreg3, vreg2, %le_lo12_r(s)
// ld.w vreg4, 8(vreg3)
//
// =>
//
// lu12i.w vreg1, %le_hi20_r(s+8)
// add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
// ld.w vreg4, vreg2, %le_lo12_r(s+8)
//
// 2. (large):
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, $zero, %pc_lo12(s)
// lu32i.d vreg3, vreg2, %pc64_lo20(s)
// lu52i.d vreg4, vreg3, %pc64_hi12(s)
// add.d vreg5, vreg4, vreg1
// ld.w vreg6, 8(vreg5)
//
// =>
//
// pcalau12i vreg1, %pc_hi20(s+8)
// addi.d vreg2, $zero, %pc_lo12(s+8)
// lu32i.d vreg3, vreg2, %pc64_lo20(s+8)
// lu52i.d vreg4, vreg3, %pc64_hi12(s+8)
// ldx.w vreg6, vreg4, vreg1
std::optional<int64_t> CommonOffset;
DenseMap<const MachineInstr *, SmallVector<unsigned>>
InlineAsmMemoryOpIndexesMap;
for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
switch (UseMI.getOpcode()) {
default:
LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);
return false;
case LoongArch::VLDREPL_B:
case LoongArch::XVLDREPL_B:
// We can't do this for the large pattern.
if (Last)
return false;
[[fallthrough]];
case LoongArch::LD_B:
case LoongArch::LD_H:
case LoongArch::LD_W:
case LoongArch::LD_D:
case LoongArch::LD_BU:
case LoongArch::LD_HU:
case LoongArch::LD_WU:
case LoongArch::LDPTR_W:
case LoongArch::LDPTR_D:
case LoongArch::FLD_S:
case LoongArch::FLD_D:
case LoongArch::VLD:
case LoongArch::XVLD:
case LoongArch::ST_B:
case LoongArch::ST_H:
case LoongArch::ST_W:
case LoongArch::ST_D:
case LoongArch::STPTR_W:
case LoongArch::STPTR_D:
case LoongArch::FST_S:
case LoongArch::FST_D:
case LoongArch::VST:
case LoongArch::XVST: {
if (UseMI.getOperand(1).isFI())
return false;
// The register produced by the address sequence must only be used as the
// base address, not as the value operand (e.g. the data operand of a store).
if (DestReg == UseMI.getOperand(0).getReg())
return false;
assert(DestReg == UseMI.getOperand(1).getReg() &&
"Expected base address use");
// All load/store instructions must use the same offset.
int64_t Offset = UseMI.getOperand(2).getImm();
if (CommonOffset && Offset != CommonOffset)
return false;
CommonOffset = Offset;
break;
}
case LoongArch::INLINEASM:
case LoongArch::INLINEASM_BR: {
// We can't do this for the large pattern.
if (Last)
return false;
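// Inline-asm operands come in groups: a flag immediate followed by the
// operands it describes. A mem-kind group for constraint "m" has exactly two
// operands, the base register and an immediate offset, which is why the loop
// below reads the operands at I + 1 and I + 2.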
SmallVector<unsigned> InlineAsmMemoryOpIndexes;
unsigned NumOps = 0;
for (unsigned I = InlineAsm::MIOp_FirstOperand;
I < UseMI.getNumOperands(); I += 1 + NumOps) {
const MachineOperand &FlagsMO = UseMI.getOperand(I);
// Should be an imm.
if (!FlagsMO.isImm())
continue;
const InlineAsm::Flag Flags(FlagsMO.getImm());
NumOps = Flags.getNumOperandRegisters();
// Memory constraints have two operands.
if (NumOps != 2 || !Flags.isMemKind()) {
// If the register is used by something other than a memory constraint,
// we should not fold.
for (unsigned J = 0; J < NumOps; ++J) {
const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
if (MO.isReg() && MO.getReg() == DestReg)
return false;
}
continue;
}
// We can only do this for constraint m.
if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m)
return false;
const MachineOperand &AddrMO = UseMI.getOperand(I + 1);
if (!AddrMO.isReg() || AddrMO.getReg() != DestReg)
continue;
const MachineOperand &OffsetMO = UseMI.getOperand(I + 2);
if (!OffsetMO.isImm())
continue;
// All inline asm memory operands must use the same offset.
int64_t Offset = OffsetMO.getImm();
if (CommonOffset && Offset != CommonOffset)
return false;
CommonOffset = Offset;
InlineAsmMemoryOpIndexes.push_back(I + 1);
}
InlineAsmMemoryOpIndexesMap.insert(
std::make_pair(&UseMI, InlineAsmMemoryOpIndexes));
break;
}
}
}
// We found a common offset.
// Update the offsets in global address lowering.
// We may have already folded some arithmetic so we need to add to any
// existing offset.
int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset;
// LA32 ignores the upper 32 bits.
if (!ST->is64Bit())
NewOffset = SignExtend64<32>(NewOffset);
// We can only fold simm32 offsets.
if (!isInt<32>(NewOffset))
return false;
// If this pass optimizes the sequence successfully, the MO_RELAX bitmask
// target-flag should be removed from the pcala code sequence. The tls-le code
// sequence can still be relaxed after being optimized.
//
// For example:
// pcalau12i $a0, %pc_hi20(symbol)
// addi.d $a0, $a0, %pc_lo12(symbol)
// ld.w $a0, $a0, 0
//
// =>
//
// pcalau12i $a0, %pc_hi20(symbol)
// ld.w $a0, $a0, %pc_lo12(symbol)
//
// The original code sequence can be relaxed by the linker, but the optimized
// sequence cannot be relaxed any more. So the MO_RELAX flag should not be
// carried by these instructions.
Hi20.getOperand(1).setOffset(NewOffset);
MachineOperand &ImmOp = Lo12.getOperand(2);
ImmOp.setOffset(NewOffset);
if (Lo20 && Hi12) {
Lo20->getOperand(2).setOffset(NewOffset);
Hi12->getOperand(2).setOffset(NewOffset);
}
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
Hi20.getOperand(1).setTargetFlags(
LoongArchII::getDirectFlags(Hi20.getOperand(1)));
ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
Add->getOperand(3).setOffset(NewOffset);
}
// Update the immediate in the load/store instructions to add the offset.
const LoongArchInstrInfo &TII = *ST->getInstrInfo();
for (MachineInstr &UseMI :
llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {
if (UseMI.getOpcode() == LoongArch::INLINEASM ||
UseMI.getOpcode() == LoongArch::INLINEASM_BR) {
auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI];
for (unsigned I : InlineAsmMemoryOpIndexes) {
MachineOperand &MO = UseMI.getOperand(I + 1);
switch (ImmOp.getType()) {
case MachineOperand::MO_GlobalAddress:
MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(),
LoongArchII::getDirectFlags(ImmOp));
break;
case MachineOperand::MO_MCSymbol:
MO.ChangeToMCSymbol(ImmOp.getMCSymbol(),
LoongArchII::getDirectFlags(ImmOp));
MO.setOffset(ImmOp.getOffset());
break;
case MachineOperand::MO_BlockAddress:
MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(),
LoongArchII::getDirectFlags(ImmOp));
break;
default:
report_fatal_error("unsupported machine operand type");
break;
}
}
} else {
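// Rewrite the memory op itself. For the large pattern the address remains
// split across the two source registers of the final ADD, so the op becomes
// its indexed (reg + reg) form; otherwise the symbol operand carrying the
// lo12 part simply replaces the immediate offset.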
UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last)));
if (Last) {
UseMI.removeOperand(2);
UseMI.removeOperand(1);
UseMI.addOperand(Last->getOperand(1));
UseMI.addOperand(Last->getOperand(2));
UseMI.getOperand(1).setIsKill(false);
UseMI.getOperand(2).setIsKill(false);
} else {
UseMI.removeOperand(2);
UseMI.addOperand(ImmOp);
}
}
}
if (Last) {
Last->eraseFromParent();
return true;
}
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
Hi20.getOperand(0).getReg());
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
Add->getOperand(0).getReg());
}
Lo12.eraseFromParent();
return true;
}
bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;
ST = &Fn.getSubtarget<LoongArchSubtarget>();
bool MadeChange = false;
MRI = &Fn.getRegInfo();
for (MachineBasicBlock &MBB : Fn) {
LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
for (MachineInstr &Hi20 : MBB) {
MachineInstr *Lo12 = nullptr;
MachineInstr *Lo20 = nullptr;
MachineInstr *Hi12 = nullptr;
MachineInstr *Last = nullptr;
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
// Detect foldable pcala code sequence in small/medium/large code model.
if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
continue;
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
MachineInstr *Add = nullptr;
// Detect foldable tls-le code sequence in small/medium code model.
if (!detectFoldable(Hi20, Add, Lo12))
continue;
} else {
continue;
}
// For tls-le, we do not pass the second PseudoAddTPRel instr in order to
// reuse the existing hooks; the last three parameters should always be
// nullptr.
MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
}
}
return MadeChange;
}
/// Returns an instance of the Merge Base Offset Optimization pass.
FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() {
return new LoongArchMergeBaseOffsetOpt();
}