[x64][win] Add compiler support for x64 import call optimization (equivalent to MSVC /d2guardretpoline) (#126631)

This is the x64 equivalent of #121516

Since import call optimization was originally [added to x64 Windows to
implement a more efficient retpoline
mitigation](https://techcommunity.microsoft.com/blog/windowsosplatform/mitigating-spectre-variant-2-with-retpoline-on-windows/295618),
the section and constant names related to this feature all mention
"retpoline", and we need to mark indirect calls, Control Flow Guard calls,
and jumps for jump tables in the section alongside calls to imported
functions.

As with the AArch64 feature, this emits a new section into the object file
that the MSVC linker uses to generate the Dynamic Value Relocation Table;
the section itself does not appear in the final binary. A sketch of the
section's contents is shown below.
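
For illustration, this is roughly what the emitted section looks like in assembly. The directives and the magic string come from the tests added in this change; the section name `tc_sect`, the label, and the specific size/kind values here are illustrative and depend on the calls actually recorded:

```asm
        .section        .retplne,"yi"
        .asciz  "RetpolineV1"           # magic header
        .long   16                      # byte size of this section's entries: (2 + 2 * NumCalls) * 4
        .secnum tc_sect                 # COFF section number of the section containing the calls
        .long   2                       # kind of the marked instruction (IMAGE_RETPOLINE_AMD64_IMPORT_BR)
        .secoffset      .Limpcall0      # offset of the marked instruction within that section
```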

The Windows Loader requires that specific instruction sequences be emitted
when this feature is enabled (see the sketch after this list):
* Indirect calls/jumps must have the function pointer to jump to in
`rax`.
* Calls to imported functions must use the `rex` prefix and be followed
by a 5-byte nop.
* Indirect calls must be followed by a 3-byte nop.
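
As a rough sketch (AT&T syntax, mirroring the new tests; `__imp_fn` and the `.Limpcall` labels are illustrative), the emitted code looks like:

```asm
.Limpcall0:                       # label recorded in the .retplne section
        rex64
        callq   __imp_fn          # call to an imported function uses a rex prefix...
        nopl    8(%rax,%rax)      # ...and is followed by a 5-byte nop

        movq    %rcx, %rax        # indirect calls/jumps must go through rax
.Limpcall1:
        callq   *%rax
        nopl    (%rax)            # indirect calls are followed by a 3-byte nop

        movq    %rcx, %rax
.Limpcall2:
        rex64 jmpq      *%rax     # indirect tail jump, also through rax
```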
Daniel Paoliello 2025-05-20 14:48:41 -07:00 committed by GitHub
parent a690852b29
commit a414877a7a
24 changed files with 588 additions and 31 deletions


@ -16,6 +16,7 @@
namespace llvm {
class FunctionPass;
class GlobalValue;
class CFGuardPass : public PassInfoMixin<CFGuardPass> {
public:
@ -34,6 +35,8 @@ FunctionPass *createCFGuardCheckPass();
/// Insert Control Flow Guard dispatches on indirect function calls.
FunctionPass *createCFGuardDispatchPass();
bool isCFGuardFunction(const GlobalValue *GV);
} // namespace llvm
#endif


@ -599,6 +599,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
if (T.getArch() == Triple::aarch64) {
ImportCallSection =
Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO);
} else if (T.getArch() == Triple::x86_64) {
// Import Call Optimization on x64 leverages the same metadata as the
// retpoline mitigation, hence the unusual section name.
ImportCallSection =
Ctx->getCOFFSection(".retplne", COFF::IMAGE_SCN_LNK_INFO);
}
// Debug info.


@ -464,7 +464,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 ||
Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi ||
Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 ||
Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX;
Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX ||
Opc == X86::TAILJMPm64_REX;
}
void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
@ -912,6 +913,9 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
if (TT.isOSBinFormatCOFF()) {
emitCOFFFeatureSymbol(M);
emitCOFFReplaceableFunctionData(M);
if (M.getModuleFlag("import-call-optimization"))
EnableImportCallOptimization = true;
}
OutStreamer->emitSyntaxDirective();
@ -1016,6 +1020,35 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
// safe to set.
OutStreamer->emitSubsectionsViaSymbols();
} else if (TT.isOSBinFormatCOFF()) {
// If import call optimization is enabled, emit the appropriate section.
// We do this whether or not we recorded any items.
if (EnableImportCallOptimization) {
OutStreamer->switchSection(getObjFileLowering().getImportCallSection());
// Section always starts with some magic.
constexpr char ImpCallMagic[12] = "RetpolineV1";
OutStreamer->emitBytes(StringRef{ImpCallMagic, sizeof(ImpCallMagic)});
// Layout of this section is:
// Per section that contains an item to record:
// uint32_t SectionSize: Size in bytes for information in this section.
// uint32_t Section Number
// Per call to imported function in section:
// uint32_t Kind: the kind of item.
// uint32_t InstOffset: the offset of the instr in its parent section.
for (auto &[Section, CallsToImportedFuncs] :
SectionToImportedFunctionCalls) {
unsigned SectionSize =
sizeof(uint32_t) * (2 + 2 * CallsToImportedFuncs.size());
OutStreamer->emitInt32(SectionSize);
OutStreamer->emitCOFFSecNumber(Section->getBeginSymbol());
for (auto &[CallsiteSymbol, Kind] : CallsToImportedFuncs) {
OutStreamer->emitInt32(Kind);
OutStreamer->emitCOFFSecOffset(CallsiteSymbol);
}
}
}
if (usesMSVCFloatingPoint(TT, M)) {
// In Windows' libcmt.lib, there is a file which is linked in only if the
// symbol _fltused is referenced. Linking this in causes some


@ -35,6 +35,26 @@ private:
bool EmitFPOData = false;
bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false;
bool IndCSPrefix = false;
bool EnableImportCallOptimization = false;
enum ImportCallKind : unsigned {
IMAGE_RETPOLINE_AMD64_IMPORT_BR = 0x02,
IMAGE_RETPOLINE_AMD64_IMPORT_CALL = 0x03,
IMAGE_RETPOLINE_AMD64_INDIR_BR = 0x04,
IMAGE_RETPOLINE_AMD64_INDIR_CALL = 0x05,
IMAGE_RETPOLINE_AMD64_INDIR_BR_REX = 0x06,
IMAGE_RETPOLINE_AMD64_CFG_BR = 0x08,
IMAGE_RETPOLINE_AMD64_CFG_CALL = 0x09,
IMAGE_RETPOLINE_AMD64_CFG_BR_REX = 0x0A,
IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST = 0x010,
IMAGE_RETPOLINE_AMD64_SWITCHTABLE_LAST = 0x01F,
};
struct ImportCallInfo {
MCSymbol *CalleeSymbol;
ImportCallKind Kind;
};
DenseMap<MCSection *, std::vector<ImportCallInfo>>
SectionToImportedFunctionCalls;
// This utility class tracks the length of a stackmap instruction's 'shadow'.
// It is used by the X86AsmPrinter to ensure that the stackmap shadow
@ -49,7 +69,7 @@ private:
void startFunction(MachineFunction &MF) {
this->MF = &MF;
}
void count(MCInst &Inst, const MCSubtargetInfo &STI,
void count(const MCInst &Inst, const MCSubtargetInfo &STI,
MCCodeEmitter *CodeEmitter);
// Called to signal the start of a shadow of RequiredSize bytes.
@ -130,6 +150,12 @@ private:
void emitMachOIFuncStubHelperBody(Module &M, const GlobalIFunc &GI,
MCSymbol *LazyPointer) override;
void emitCallInstruction(const llvm::MCInst &MCI);
// Emits a label to mark the next instruction as being relevant to Import Call
// Optimization.
void emitLabelAndRecordForImportCallOptimization(ImportCallKind Kind);
public:
X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);


@ -274,6 +274,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::TCRETURNdi64:
case X86::TCRETURNdi64cc:
case X86::TCRETURNri64:
case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
MachineOperand &JumpTarget = MBBI->getOperand(0);
@ -345,12 +346,14 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
} else if ((Opcode == X86::TCRETURNri64) ||
(Opcode == X86::TCRETURNri64_ImpCall)) {
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
.add(JumpTarget);
} else {
assert(!IsWin64 && "Win64 requires REX for indirect jumps.");
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
.add(JumpTarget);
@ -875,6 +878,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::CALL64m_RVMARKER:
expandCALL_RVMARKER(MBB, MBBI);
return true;
case X86::CALL64r_ImpCall:
MI.setDesc(TII->get(X86::CALL64r));
return true;
case X86::ADD32mi_ND:
case X86::ADD64mi32_ND:
case X86::SUB32mi_ND:


@ -34,6 +34,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
@ -3316,6 +3317,11 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (Flag.isSwiftError() || Flag.isPreallocated())
return false;
// Can't handle import call optimization.
if (Is64Bit &&
MF->getFunction().getParent()->getModuleFlag("import-call-optimization"))
return false;
SmallVector<MVT, 16> OutVTs;
SmallVector<Register, 16> ArgRegs;


@ -2399,7 +2399,8 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
static bool isTailCallOpcode(unsigned Opc) {
return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64;
Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 ||
Opc == X86::TCRETURNmi64;
}
void X86FrameLowering::emitEpilogue(MachineFunction &MF,


@ -19179,7 +19179,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
}
SDValue
@ -19207,7 +19207,8 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const {
bool ForCall,
bool *IsImpCall) const {
// Unpack the global address or external symbol.
SDLoc dl(Op);
const GlobalValue *GV = nullptr;
@ -19257,6 +19258,16 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
return Result;
// If Import Call Optimization is enabled and this is an imported function
// then make a note of it and return the global address without wrapping.
if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
Mod.getModuleFlag("import-call-optimization")) {
assert(ForCall && "Should only enable import call optimization if we are "
"lowering a call");
*IsImpCall = true;
return Result;
}
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
@ -19282,7 +19293,7 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
}
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
@ -34821,6 +34832,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FST)
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(CALL_RVMARKER)
NODE_NAME_CASE(IMP_CALL)
NODE_NAME_CASE(BT)
NODE_NAME_CASE(CMP)
NODE_NAME_CASE(FCMP)
@ -62092,6 +62104,7 @@ X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
Register TargetReg;
switch (MBBI->getOpcode()) {
case X86::CALL64r:
case X86::CALL64r_ImpCall:
case X86::CALL64r_NT:
case X86::TAILJMPr64:
case X86::TAILJMPr64_REX:


@ -90,6 +90,10 @@ namespace llvm {
/// POP_FROM_X87_REG (which may remove a required FPU stack pop).
POP_FROM_X87_REG,
// Pseudo for a call to an imported function to ensure the correct machine
// instruction is emitted for Import Call Optimization.
IMP_CALL,
/// X86 compare and logical compare instructions.
CMP,
FCMP,
@ -1746,8 +1750,8 @@ namespace llvm {
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const;
SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall,
bool *IsImpCall) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;


@ -2050,6 +2050,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
if (IsIndirectCall && !IsWin64 &&
M->getModuleFlag("import-call-optimization"))
errorUnsupported(DAG, dl,
"Indirect calls must have a normal calling convention if "
"Import Call Optimization is enabled");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
@ -2421,6 +2427,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InGlue = Chain.getValue(1);
}
bool IsImpCall = false;
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
@ -2433,7 +2440,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// ForCall to true here has the effect of removing WrapperRIP when possible
// to allow direct calls to be selected without first materializing the
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true, &IsImpCall);
} else if (Subtarget.isTarget64BitILP32() &&
Callee.getValueType() == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@ -2555,7 +2562,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Returns a chain & a glue for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (IsNoTrackIndirectCall) {
if (IsImpCall) {
Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops);
} else if (IsNoTrackIndirectCall) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
// Calls with a "clang.arc.attachedcall" bundle are special. They should be


@ -1313,6 +1313,8 @@ def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)),
def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)),
(CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>;
def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
(CALL64pcrel32 tglobaladdr:$dst)>;
// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
// can never use callee-saved registers. That is the purpose of the GR64_TC
@ -1344,7 +1346,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationEnabled]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.


@ -327,7 +327,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabled]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
@ -357,6 +357,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TCRETURNri64 : PseudoI<(outs),
(ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
def TCRETURNri64_ImpCall : PseudoI<(outs),
(ins GR64_A:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
(ins i64mem_TC:$dst, i32imm:$offset),
@ -418,6 +422,10 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
def CALL64pcrel32_RVMARKER :
PseudoI<(outs), (ins i64imm:$rvfunc, i64i32imm_brtarget:$dst), []>,
Requires<[In64BitMode]>;
def CALL64r_ImpCall :
PseudoI<(outs), (ins GR64_A:$dst), [(X86call GR64_A:$dst)]>,
Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationEnabled]>;
}
// Conditional tail calls are similar to the above, but they are branches


@ -210,6 +210,9 @@ def X86call_rvmarker : SDNode<"X86ISD::CALL_RVMARKER", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
def X86imp_call : SDNode<"X86ISD::IMP_CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,


@ -3714,6 +3714,7 @@ bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
return true;
default:
@ -7458,7 +7459,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
(Opc == X86::CALL32r || Opc == X86::CALL64r || Opc == X86::PUSH16r ||
(Opc == X86::CALL32r || Opc == X86::CALL64r ||
Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
Opc == X86::PUSH32r || Opc == X86::PUSH64r))
return nullptr;


@ -233,6 +233,8 @@ let RecomputePerFunction = 1 in {
"shouldOptForSize(MF)">;
def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
"!Subtarget->hasSSE41()">;
def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
}
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;


@ -48,6 +48,7 @@
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/CFGuard.h"
#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
#include <string>
@ -113,7 +114,7 @@ struct NoAutoPaddingScope {
static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
const X86Subtarget *Subtarget);
void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
void X86AsmPrinter::StackMapShadowTracker::count(const MCInst &Inst,
const MCSubtargetInfo &STI,
MCCodeEmitter *CodeEmitter) {
if (InShadow) {
@ -2214,6 +2215,31 @@ static void addConstantComments(const MachineInstr *MI,
}
}
// Does the given operand refer to a DLLIMPORT function?
bool isImportedFunction(const MachineOperand &MO) {
return MO.isGlobal() && (MO.getTargetFlags() == X86II::MO_DLLIMPORT);
}
// Is the given instruction a call to a CFGuard function?
bool isCallToCFGuardFunction(const MachineInstr *MI) {
assert(MI->getOpcode() == X86::TAILJMPm64_REX ||
MI->getOpcode() == X86::CALL64m);
const MachineOperand &MO = MI->getOperand(3);
return MO.isGlobal() && (MO.getTargetFlags() == X86II::MO_NO_FLAG) &&
isCFGuardFunction(MO.getGlobal());
}
// Does the containing block for the given instruction contain any jump table
// info (indicating that the block is a dispatch for a jump table)?
bool hasJumpTableInfoInBlock(const llvm::MachineInstr *MI) {
const MachineBasicBlock &MBB = *MI->getParent();
for (auto I = MBB.instr_rbegin(), E = MBB.instr_rend(); I != E; ++I)
if (I->isJumpTableDebugInfo())
return true;
return false;
}
void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Enable feature predicate checks once all the tests pass.
// X86_MC::verifyInstructionPredicates(MI->getOpcode(),
@ -2292,7 +2318,16 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TAILJMPd64:
if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
[[fallthrough]];
if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_IMPORT_BR);
}
// Lower this as normal, but add a comment.
OutStreamer->AddComment("TAILCALL");
break;
case X86::TAILJMPr:
case X86::TAILJMPm:
case X86::TAILJMPd:
@ -2300,12 +2335,58 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TAILJMPr64:
case X86::TAILJMPm64:
case X86::TAILJMPd64_CC:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
if (EnableImportCallOptimization)
report_fatal_error("Unexpected TAILJMP instruction was emitted when "
"import call optimization was enabled");
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
break;
case X86::TAILJMPm64_REX:
if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_CFG_BR_REX);
}
OutStreamer->AddComment("TAILCALL");
break;
case X86::TAILJMPr64_REX: {
if (EnableImportCallOptimization) {
assert(MI->getOperand(0).getReg() == X86::RAX &&
"Indirect tail calls with impcall enabled must go through RAX (as "
"enforced by TCRETURNImpCallri64)");
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_INDIR_BR);
}
OutStreamer->AddComment("TAILCALL");
break;
}
case X86::JMP64r:
if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI)) {
uint16_t EncodedReg =
this->getSubtarget().getRegisterInfo()->getEncodingValue(
MI->getOperand(0).getReg().asMCReg());
emitLabelAndRecordForImportCallOptimization(
(ImportCallKind)(IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST +
EncodedReg));
}
break;
case X86::JMP16r:
case X86::JMP16m:
case X86::JMP32r:
case X86::JMP32m:
case X86::JMP64m:
if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI))
report_fatal_error(
"Unexpected JMP instruction was emitted for a jump-table when import "
"call optimization was enabled");
break;
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_addrX32:
@ -2492,7 +2573,50 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::CALL64pcrel32:
if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));
if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_IMPORT_CALL);
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
// For Import Call Optimization to work, we need the call instruction to
// have a rex prefix, and a 5-byte nop after the call instruction.
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
emitCallInstruction(TmpInst);
emitNop(*OutStreamer, 5, Subtarget);
return;
}
break;
case X86::CALL64r:
if (EnableImportCallOptimization) {
assert(MI->getOperand(0).getReg() == X86::RAX &&
"Indirect calls with impcall enabled must go through RAX (as "
"enforced by CALL64r_ImpCall)");
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_INDIR_CALL);
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
emitCallInstruction(TmpInst);
// For Import Call Optimization to work, we need a 3-byte nop after the
// call instruction.
emitNop(*OutStreamer, 3, Subtarget);
return;
}
break;
case X86::CALL64m:
if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_CFG_CALL);
}
break;
case X86::JCC_1:
// Two instruction prefixes (2EH for branch not-taken and 3EH for branch
// taken) are used as branch hints. Here we add branch taken prefix for
@ -2513,20 +2637,36 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
// Stackmap shadows cannot include branch targets, so we can count the bytes
// in a call towards the shadow, but must ensure that no thread returns
// into the stackmap shadow. The only way to achieve this is if the call
// is at the end of the shadow.
if (MI->isCall()) {
// Count the size of the call towards the shadow
SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
// Then flush the shadow so that we fill with nops before the call, not
// after it.
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
// Then emit the call
OutStreamer->emitInstruction(TmpInst, getSubtargetInfo());
emitCallInstruction(TmpInst);
return;
}
EmitAndCountInstruction(TmpInst);
}
void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
// Stackmap shadows cannot include branch targets, so we can count the bytes
// in a call towards the shadow, but must ensure that no thread returns
// into the stackmap shadow. The only way to achieve this is if the call
// is at the end of the shadow.
// Count the size of the call towards the shadow
SMShadowTracker.count(MCI, getSubtargetInfo(), CodeEmitter.get());
// Then flush the shadow so that we fill with nops before the call, not
// after it.
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
// Then emit the call
OutStreamer->emitInstruction(MCI, getSubtargetInfo());
}
void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
ImportCallKind Kind) {
assert(EnableImportCallOptimization);
MCSymbol *CallSiteSymbol = MMI->getContext().createNamedTempSymbol("impcall");
OutStreamer->emitLabel(CallSiteSymbol);
SectionToImportedFunctionCalls[OutStreamer->getCurrentSectionOnly()]
.push_back({CallSiteSymbol, Kind});
}


@ -999,6 +999,7 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNri64_ImpCall:
case X86::TCRETURNmi64:
case X86::EH_RETURN:
case X86::EH_RETURN64: {


@ -737,6 +737,10 @@ def GR32_SIDI : RegisterClass<"X86", [i32], 32, (add ESI, EDI)>;
def GR32_DIBP : RegisterClass<"X86", [i32], 32, (add EDI, EBP)>;
def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>;
// Class to support Windows Import Call Optimization: all indirect jumps must
// happen through RAX.
def GR64_A : RegisterClass<"X86", [i64], 64, (add RAX)>;
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;


@ -31,6 +31,9 @@ using OperandBundleDef = OperandBundleDefT<Value *>;
STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added");
constexpr StringRef GuardCheckFunctionName = "__guard_check_icall_fptr";
constexpr StringRef GuardDispatchFunctionName = "__guard_dispatch_icall_fptr";
namespace {
/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes.
@ -45,10 +48,10 @@ public:
// Get or insert the guard check or dispatch global symbols.
switch (GuardMechanism) {
case Mechanism::Check:
GuardFnName = "__guard_check_icall_fptr";
GuardFnName = GuardCheckFunctionName;
break;
case Mechanism::Dispatch:
GuardFnName = "__guard_dispatch_icall_fptr";
GuardFnName = GuardDispatchFunctionName;
break;
}
}
@ -318,3 +321,11 @@ FunctionPass *llvm::createCFGuardCheckPass() {
FunctionPass *llvm::createCFGuardDispatchPass() {
return new CFGuard(CFGuardPass::Mechanism::Dispatch);
}
bool llvm::isCFGuardFunction(const GlobalValue *GV) {
if (GV->getLinkage() != GlobalValue::ExternalLinkage)
return false;
StringRef Name = GV->getName();
return Name == GuardCheckFunctionName || Name == GuardDispatchFunctionName;
}


@ -0,0 +1,34 @@
; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
call void %func_ptr()
ret void
}
; CHECK-LABEL: normal_call:
; CHECK: .Limpcall0:
; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
tail call void %func_ptr()
ret void
}
; CHECK-LABEL: tail_call_fp:
; CHECK: .Limpcall1:
; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)
; CHECK-LABEL: .section .retplne,"yi"
; CHECK-NEXT: .asciz "RetpolineV1"
; CHECK-NEXT: .long 16
; CHECK-NEXT: .secnum tc_sect
; CHECK-NEXT: .long 10
; CHECK-NEXT: .secoffset .Limpcall1
; CHECK-NEXT: .long 16
; CHECK-NEXT: .secnum nc_sect
; CHECK-NEXT: .long 9
; CHECK-NEXT: .secoffset .Limpcall0
!llvm.module.flags = !{!0, !1}
!0 = !{i32 1, !"import-call-optimization", i32 1}
!1 = !{i32 2, !"cfguard", i32 2}


@ -0,0 +1,83 @@
; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
; CHECK-LABEL: uses_rax:
; CHECK: .Limpcall0:
; CHECK-NEXT: jmpq *%rax
define void @uses_rax(i32 %x) {
entry:
switch i32 %x, label %sw.epilog [
i32 0, label %sw.bb
i32 1, label %sw.bb1
i32 2, label %sw.bb2
i32 3, label %sw.bb3
]
sw.bb:
tail call void @g(i32 0) #2
br label %sw.epilog
sw.bb1:
tail call void @g(i32 1) #2
br label %sw.epilog
sw.bb2:
tail call void @g(i32 2) #2
br label %sw.epilog
sw.bb3:
tail call void @g(i32 3) #2
br label %sw.epilog
sw.epilog:
tail call void @g(i32 10) #2
ret void
}
; CHECK-LABEL: uses_rcx:
; CHECK: .Limpcall1:
; CHECK-NEXT: jmpq *%rcx
define void @uses_rcx(i32 %x) {
entry:
switch i32 %x, label %sw.epilog [
i32 10, label %sw.bb
i32 11, label %sw.bb1
i32 12, label %sw.bb2
i32 13, label %sw.bb3
]
sw.bb:
tail call void @g(i32 0) #2
br label %sw.epilog
sw.bb1:
tail call void @g(i32 1) #2
br label %sw.epilog
sw.bb2:
tail call void @g(i32 2) #2
br label %sw.epilog
sw.bb3:
tail call void @g(i32 3) #2
br label %sw.epilog
sw.epilog:
tail call void @g(i32 10) #2
ret void
}
declare void @g(i32)
; CHECK-LABEL: .section .retplne,"yi"
; CHECK-NEXT: .asciz "RetpolineV1"
; CHECK-NEXT: .long 24
; CHECK-NEXT: .secnum .text
; CHECK-NEXT: .long 16
; CHECK-NEXT: .secoffset .Limpcall0
; CHECK-NEXT: .long 17
; CHECK-NEXT: .secoffset .Limpcall1
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}


@ -0,0 +1,21 @@
; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
define dso_local void @normal_call() local_unnamed_addr {
entry:
call void @a()
ret void
}
; CHECK-LABEL: normal_call:
; CHECK: callq a
declare void @a() local_unnamed_addr
; Even if there are no calls to imported functions, we still need to emit the
; .retplne section.
; CHECK-LABEL: .section .retplne,"yi"
; CHECK-NEXT: .asciz "RetpolineV1"
; CHECK-NOT: .secnum
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}


@ -0,0 +1,67 @@
; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
call void @a()
call void @a()
call void %func_ptr()
ret void
}
; CHECK-LABEL: normal_call:
; CHECK: .Limpcall0:
; CHECK-NEXT: rex64
; CHECK-NEXT: callq __imp_a
; CHECK-NEXT: nopl 8(%rax,%rax)
; CHECK-NEXT: .Limpcall1:
; CHECK-NEXT: rex64
; CHECK-NEXT: callq __imp_a
; CHECK-NEXT: nopl 8(%rax,%rax)
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .Limpcall2:
; CHECK-NEXT: callq *%rax
; CHECK-NEXT: nopl (%rax)
; CHECK-NEXT: nop
define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
entry:
tail call void @b()
ret void
}
; CHECK-LABEL: tail_call:
; CHECK: .Limpcall3:
; CHECK-NEXT: jmp __imp_b
define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
tail call void %func_ptr()
ret void
}
; CHECK-LABEL: tail_call_fp:
; CHECK: movq %rcx, %rax
; CHECK-NEXT: .Limpcall4:
; CHECK-NEXT: rex64 jmpq *%rax
declare dllimport void @a() local_unnamed_addr
declare dllimport void @b() local_unnamed_addr
; CHECK-LABEL: .section .retplne,"yi"
; CHECK-NEXT: .asciz "RetpolineV1"
; CHECK-NEXT: .long 24
; CHECK-NEXT: .secnum tc_sect
; CHECK-NEXT: .long 2
; CHECK-NEXT: .secoffset .Limpcall3
; CHECK-NEXT: .long 4
; CHECK-NEXT: .secoffset .Limpcall4
; CHECK-NEXT: .long 32
; CHECK-NEXT: .secnum nc_sect
; CHECK-NEXT: .long 3
; CHECK-NEXT: .secoffset .Limpcall0
; CHECK-NEXT: .long 3
; CHECK-NEXT: .secoffset .Limpcall1
; CHECK-NEXT: .long 5
; CHECK-NEXT: .secoffset .Limpcall2
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}


@ -0,0 +1,69 @@
// RUN: llvm-mc -triple x86_64-windows-msvc -filetype obj -o %t.obj %s
// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s
.section nc_sect,"xr"
normal_call:
.seh_proc normal_call
# %bb.0: # %entry
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
.Limpcall0:
rex64
callq *__imp_a(%rip)
nopl 8(%rax,%rax)
nop
addq $40, %rsp
retq
.seh_endproc
.section tc_sect,"xr"
tail_call:
.Limpcall1:
rex64
jmp *__imp_b(%rip)
.section .retplne,"yi"
.asciz "RetpolineV1"
.long 16
.secnum tc_sect
.long 2
.secoffset .Limpcall1
.long 16
.secnum nc_sect
.long 3
.secoffset .Limpcall0
// CHECK-LABEL: Name: .retplne (2E 72 65 74 70 6C 6E 65)
// CHECK-NEXT: VirtualSize: 0x0
// CHECK-NEXT: VirtualAddress: 0x0
// CHECK-NEXT: RawDataSize: 44
// CHECK-NEXT: PointerToRawData:
// CHECK-NEXT: PointerToRelocations:
// CHECK-NEXT: PointerToLineNumbers:
// CHECK-NEXT: RelocationCount: 0
// CHECK-NEXT: LineNumberCount: 0
// CHECK-NEXT: Characteristics [
// CHECK-NEXT: IMAGE_SCN_ALIGN_1BYTES
// CHECK-NEXT: IMAGE_SCN_LNK_INFO
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
// CHECK-NEXT: 52657470 6F6C696E 65563100 10000000 |RetpolineV1.....|
// CHECK-NEXT: 0010:
// CHECK-SAME: [[#%.2X,TCSECT:]]000000
// CHECK-SAME: 02000000
// CHECK-SAME: [[#%.2X,TCOFFSET:]]000000
// CHECK-SAME: 10000000
// CHECK-NEXT: 0020:
// CHECK-SAME: [[#%.2X,NCSECT:]]000000
// CHECK-SAME: 03000000
// CHECK-SAME: [[#%.2X,NCOFFSET:]]000000
// CHECK-NEXT: )
// CHECK-LABEL: Relocations [
// CHECK-NEXT: Section ([[#%u,NCSECT]]) nc_sect {
// CHECK-NEXT: 0x[[#%x,NCOFFSET + 3]] IMAGE_REL_AMD64_REL32 __imp_a
// CHECK-NEXT: }
// CHECK-NEXT: Section ([[#%u,TCSECT]]) tc_sect {
// CHECK-NEXT: 0x[[#%x,TCOFFSET + 3]] IMAGE_REL_AMD64_REL32 __imp_b
// CHECK-NEXT: }